From 24ad6d6cbeed8428b3a18dcdaa308acdcb0e41c5 Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Wed, 21 Jan 2026 03:44:41 +0200 Subject: [PATCH 01/13] feat: extended columns --- include/data_model/intermediate.h | 130 ++++++++++++++ include/foundation/common.h | 45 ++++- include/join_execution/hash_join.h | 5 +- include/join_execution/join_setup.h | 83 +++++++-- include/join_execution/nested_loop.h | 10 +- .../materialization/construct_intermediate.h | 163 ++++++++++++++++-- include/materialization/materialize.h | 5 +- src/execute.cpp | 18 +- 8 files changed, 415 insertions(+), 44 deletions(-) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index e0e2667..5f693a7 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -152,6 +152,84 @@ struct column_t { /** @brief Alias for a collection of intermediate columns. */ using Columnar = std::vector; +/** + * @brief Row ID column storing encoded global row IDs. + * + * Parallel structure to column_t but stores uint32_t (encoded table_id + + * row_id). One column per base table participating in joins up to this point. + * Uses same page size and arena allocation as column_t. + * + * @see GlobalRowId for encoding scheme, ExtendedResult for usage. + */ +struct rowid_column_t { + /** @brief Page for row ID storage: fixed array of uint32_t entries. */ + struct alignas(IR_PAGE_SIZE) Page { + uint32_t data[CAP_PER_PAGE]; + }; + + std::vector pages; ///< Pointers to arena-allocated pages. + size_t num_values = 0; ///< Total row ID count across all pages. + uint8_t table_id = 0; ///< Which base table this column tracks. 
+ + rowid_column_t() = default; + + rowid_column_t(rowid_column_t &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values), + table_id(other.table_id) { + other.pages.clear(); + other.num_values = 0; + } + + rowid_column_t &operator=(rowid_column_t &&other) noexcept { + if (this != &other) { + pages = std::move(other.pages); + num_values = other.num_values; + table_id = other.table_id; + other.pages.clear(); + other.num_values = 0; + } + return *this; + } + + rowid_column_t(const rowid_column_t &) = delete; + rowid_column_t &operator=(const rowid_column_t &) = delete; + + ~rowid_column_t() = default; + + /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. */ + inline uint32_t operator[](size_t idx) const { + return pages[idx >> 12]->data[idx & 0xFFF]; + } + + /** @brief Thread-safe write at idx (requires pages to be set up first). */ + inline void write_at(size_t idx, uint32_t val) { + pages[idx >> 12]->data[idx & 0xFFF] = val; + } + + /** @brief Total row ID count. */ + size_t row_count() const { return num_values; } + + /** @brief Set row count without allocation (for assembly pattern). */ + inline void set_row_count(size_t count) { num_values = count; } + + /** @brief Pre-allocate pages from arena. */ + inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, + size_t count) { + static_assert(sizeof(Page) == + Contest::platform::ChunkSize< + Contest::platform::ChunkType::IR_PAGE>::value, + "Page size mismatch with IR_PAGE chunk size"); + size_t pages_needed = (count + CAP_PER_PAGE - 1) / CAP_PER_PAGE; + pages.reserve(pages_needed); + for (size_t i = 0; i < pages_needed; ++i) { + void *ptr = + arena.alloc_chunk(); + pages.push_back(reinterpret_cast(ptr)); + } + num_values = count; + } +}; + /** * @brief Convert column_t vector to ColumnarTable. Dereferences VARCHAR refs. 
* @see materialize.h @@ -163,4 +241,56 @@ ColumnarTable to_columnar(const Columnar &table, const Plan &plan); namespace Contest { /** @brief Result type for non-root joins (intermediate format). */ using ExecuteResult = std::vector; + +/** + * @brief Extended intermediate result with row ID tracking. + * + * Wraps ExecuteResult with parallel row ID columns that track + * which original scan rows contributed to each intermediate row. + * One rowid_column_t per base table participating in the join tree. + * + * @see GlobalRowId for encoding, construct_intermediate.h for population. + */ +struct ExtendedResult { + ExecuteResult columns; ///< Data columns (value_t). + std::vector row_ids; ///< One per participating table. + std::vector table_ids; ///< Which tables are tracked (sorted). + + ExtendedResult() = default; + + ExtendedResult(ExtendedResult &&) = default; + ExtendedResult &operator=(ExtendedResult &&) = default; + + ExtendedResult(const ExtendedResult &) = delete; + ExtendedResult &operator=(const ExtendedResult &) = delete; + + /** @brief Total row count (from first data column). */ + size_t row_count() const { + return columns.empty() ? 0 : columns[0].row_count(); + } + + /** @brief Find row ID column index for a specific table, or -1 if not + * found. */ + int find_rowid_index(uint8_t tid) const { + for (size_t i = 0; i < table_ids.size(); ++i) { + if (table_ids[i] == tid) + return static_cast(i); + } + return -1; + } + + /** @brief Get row ID column for a specific table, or nullptr if not found. + */ + const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + int idx = find_rowid_index(tid); + return (idx >= 0) ? &row_ids[idx] : nullptr; + } + + /** @brief Get mutable row ID column for a specific table, or nullptr. */ + mema::rowid_column_t *get_rowid_column_mut(uint8_t tid) { + int idx = find_rowid_index(tid); + return (idx >= 0) ? 
&row_ids[idx] : nullptr; + } +}; + } /* namespace Contest */ diff --git a/include/foundation/common.h b/include/foundation/common.h index 16c8aa7..192fe08 100644 --- a/include/foundation/common.h +++ b/include/foundation/common.h @@ -125,7 +125,8 @@ class File { } }; -/** @brief Read entire file into string. @throws std::runtime_error on failure. */ +/** @brief Read entire file into string. @throws std::runtime_error on failure. + */ inline std::string read_file(const std::filesystem::path &path) { File f(path, "rb"); ::fseek(f, 0, SEEK_END); @@ -154,7 +155,8 @@ struct DSU { void unite(size_t x, size_t y) { pa[find(x)] = find(y); } }; -/** @brief Mark unreachable code path for compiler optimization (UB if reached). */ +/** @brief Mark unreachable code path for compiler optimization (UB if reached). + */ [[noreturn]] inline void unreachable() { // Uses compiler specific extensions if possible. // Even if no extension is used, undefined behavior is still raised by @@ -164,4 +166,41 @@ struct DSU { #else // GCC, Clang __builtin_unreachable(); #endif -} \ No newline at end of file +} + +namespace Contest { + +/** + * @brief Encoded global row ID: 5-bit table_id + 27-bit row_id. + * + * Supports up to 32 tables and 134M rows per table. + * Used to track original scan table rows through recursive joins. + * + * Encoding: [table_id (5 bits)][row_id (27 bits)] + * - table_id: bits 27-31 + * - row_id: bits 0-26 + */ +struct GlobalRowId { + static constexpr uint32_t TABLE_BITS = 5; + static constexpr uint32_t ROW_BITS = 27; + static constexpr uint32_t TABLE_SHIFT = ROW_BITS; + static constexpr uint32_t ROW_MASK = (1u << ROW_BITS) - 1; + static constexpr uint32_t MAX_TABLES = 1u << TABLE_BITS; // 32 + static constexpr uint32_t MAX_ROWS = 1u << ROW_BITS; // 134,217,728 + + /** @brief Encode table_id and row_id into a single uint32_t. 
*/ + static inline uint32_t encode(uint8_t table_id, uint32_t row_id) { + return (static_cast(table_id) << TABLE_SHIFT) | + (row_id & ROW_MASK); + } + + /** @brief Extract table_id from encoded global row ID. */ + static inline uint8_t table(uint32_t encoded) { + return static_cast(encoded >> TABLE_SHIFT); + } + + /** @brief Extract row_id from encoded global row ID. */ + static inline uint32_t row(uint32_t encoded) { return encoded & ROW_MASK; } +}; + +} // namespace Contest \ No newline at end of file diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index 0e2b777..b2f1f00 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -28,6 +28,7 @@ namespace Contest::join { using Contest::ExecuteResult; +using Contest::ExtendedResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -56,8 +57,8 @@ inline UnchainedHashtable build_from_columnar(const JoinInput &input, */ inline UnchainedHashtable build_from_intermediate(const JoinInput &input, size_t attr_idx) { - const auto &result = std::get(input.data); - const auto &column = result[attr_idx]; + const auto &result = std::get(input.data); + const auto &column = result.columns[attr_idx]; size_t row_count = input.row_count(attr_idx); UnchainedHashtable hash_table(row_count); diff --git a/include/join_execution/join_setup.h b/include/join_execution/join_setup.h index 299dd65..188873d 100644 --- a/include/join_execution/join_setup.h +++ b/include/join_execution/join_setup.h @@ -22,16 +22,17 @@ namespace Contest::join { using Contest::ExecuteResult; +using Contest::ExtendedResult; using Contest::io::ColumnarReader; /** * @brief Unified abstraction over columnar tables and intermediate results. * - * Stores ColumnarTable* (base scans) or ExecuteResult (child joins). Node + * Stores ColumnarTable* (base scans) or ExtendedResult (child joins). Node * provides output_attrs mapping for column resolution. 
*/ struct JoinInput { - std::variant data; + std::variant data; const PlanNode *node; /**< Provides output_attrs for column mapping. */ uint8_t table_id; /**< Source table ID for provenance tracking. */ @@ -50,12 +51,35 @@ struct JoinInput { auto [actual_col_idx, _] = node->output_attrs[col_idx]; return table->num_rows; } else { - return std::get(data)[col_idx].row_count(); + return std::get(data).columns[col_idx].row_count(); } } /** @brief Number of output columns. */ size_t output_size() const { return node->output_attrs.size(); } + + /** + * @brief Get list of tables whose row IDs are tracked in this input. + * + * For columnar input: returns {table_id}. + * For intermediate: returns the tracked table_ids from ExtendedResult. + */ + std::vector tracked_tables() const { + if (is_columnar()) { + return {table_id}; + } + return std::get(data).table_ids; + } + + /** + * @brief Get row ID column for a specific table. + * @return nullptr for columnar inputs (row IDs encoded on-the-fly). + */ + const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_rowid_column(tid); + } }; /** @@ -159,13 +183,13 @@ inline MatchCollectionMode determine_collection_mode( /** * @brief Creates output columns with provenance metadata from inputs. 
*/ -inline ExecuteResult initialize_output_columns( +inline ExtendedResult initialize_output_columns( const std::vector> &output_attrs, const PlanNode &left_node, const PlanNode &right_node, const JoinInput &left_input, const JoinInput &right_input, size_t estimated_rows) { - ExecuteResult results; - results.reserve(output_attrs.size()); + ExtendedResult results; + results.columns.reserve(output_attrs.size()); size_t left_size = left_input.output_size(); auto set_column_metadata = [](mema::column_t &col, const JoinInput &input, @@ -175,9 +199,9 @@ inline ExecuteResult initialize_output_columns( col.source_table = input.table_id; col.source_column = actual_col_idx; } else { - const auto &result = std::get(input.data); - col.source_table = result[col_idx].source_table; - col.source_column = result[col_idx].source_column; + const auto &result = std::get(input.data); + col.source_table = result.columns[col_idx].source_table; + col.source_column = result.columns[col_idx].source_column; } }; @@ -188,7 +212,7 @@ inline ExecuteResult initialize_output_columns( mema::column_t col; set_column_metadata(col, input, node, local_idx); - results.push_back(std::move(col)); + results.columns.push_back(std::move(col)); } return results; @@ -200,9 +224,10 @@ inline ExecuteResult initialize_output_columns( * prepared flag implements lazy PageIndex construction. */ struct JoinSetup { - ExecuteResult results; /**< Output columns being populated. */ + ExtendedResult results; /**< Output columns + row ID columns. */ ColumnarReader columnar_reader; /**< Page cursor caching for columnar access. */ + std::vector merged_table_ids; /**< Tables tracked in output. */ /** * True after prepare_output_columns called. */ @@ -211,10 +236,41 @@ struct JoinSetup { JoinSetup() : prepared(false) {} }; +/** + * @brief Merge tracked table IDs from build and probe (sorted, unique). + * + * Both input vectors must be sorted. Output is sorted and deduplicated. 
+ */ +inline std::vector +merge_tracked_tables(const std::vector &build_tables, + const std::vector &probe_tables) { + std::vector merged; + merged.reserve(build_tables.size() + probe_tables.size()); + + size_t i = 0, j = 0; + while (i < build_tables.size() && j < probe_tables.size()) { + if (build_tables[i] < probe_tables[j]) { + merged.push_back(build_tables[i++]); + } else if (probe_tables[j] < build_tables[i]) { + merged.push_back(probe_tables[j++]); + } else { + merged.push_back(build_tables[i++]); + j++; // Skip duplicate + } + } + while (i < build_tables.size()) + merged.push_back(build_tables[i++]); + while (j < probe_tables.size()) + merged.push_back(probe_tables[j++]); + + return merged; +} + /** * @brief Initializes JoinSetup with output columns; call before join execution. * * PageIndex construction deferred to prepare_output_columns(). + * Computes merged table IDs from build and probe inputs. */ inline JoinSetup setup_join(const JoinInput &build_input, const JoinInput &probe_input, @@ -229,6 +285,11 @@ setup_join(const JoinInput &build_input, const JoinInput &probe_input, initialize_output_columns(output_attrs, left_node, right_node, left_input, right_input, estimated_rows); + // Compute merged table IDs from build and probe sides + auto build_tables = build_input.tracked_tables(); + auto probe_tables = probe_input.tracked_tables(); + setup.merged_table_ids = merge_tracked_tables(build_tables, probe_tables); + setup.prepared = false; return setup; diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index e1086d0..7646639 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -28,6 +28,8 @@ */ namespace Contest::join { +using Contest::ExtendedResult; + using Contest::ExecuteResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -69,8 +71,8 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } } else { - const auto &res = 
std::get(input.data); - const mema::column_t &col = res[attr_idx]; + const auto &res = std::get(input.data); + const mema::column_t &col = res.columns[attr_idx]; size_t count = col.row_count(); for (size_t i = 0; i < count; i++) { const mema::value_t &val = col[i]; @@ -190,8 +192,8 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, } } } else { - const auto &res = std::get(probe_input.data); - const mema::column_t &col = res[probe_attr]; + const auto &res = std::get(probe_input.data); + const mema::column_t &col = res.columns[probe_attr]; size_t count = col.row_count(); size_t start = (t_id * count) / THREAD_COUNT; size_t end = ((t_id + 1) * count) / THREAD_COUNT; diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index 45a4386..090863f 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -24,6 +24,8 @@ namespace Contest::materialize { using Contest::ExecuteResult; +using Contest::ExtendedResult; +using Contest::GlobalRowId; using Contest::io::ColumnarReader; using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; @@ -52,8 +54,8 @@ struct alignas(8) SourceInfo { * @brief Builds SourceInfo for each output column for fast hot-loop lookup. * * @param remapped_attrs Output column specifications (global indexing). - * @param build_input Build side data (ColumnarTable* or ExecuteResult). - * @param probe_input Probe side data (ColumnarTable* or ExecuteResult). + * @param build_input Build side data (ColumnarTable* or ExtendedResult). + * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). * @param build_node PlanNode for build side (contains output_attrs). * @param probe_node PlanNode for probe side (contains output_attrs). * @param build_size Number of columns from build side. 
@@ -83,31 +85,98 @@ prepare_sources(const std::vector> &remapped_attrs, info.columnar_col = &table->columns[actual_idx]; } else { info.is_columnar = false; - const auto &res = std::get(input.data); - info.intermediate_col = &res[local_idx]; + const auto &res = std::get(input.data); + info.intermediate_col = &res.columns[local_idx]; } sources.push_back(info); } return sources; } +/** + * @brief Precomputed metadata for resolving a row ID column's source. + * + * Determines how to populate each output row ID column: + * - For columnar input: encode GlobalRowId on-the-fly from local index + * - For intermediate input: copy from existing rowid_column_t + * + * @see prepare_rowid_sources() for precomputation logic. + */ +struct alignas(8) RowIdSource { + const mema::rowid_column_t *source_col = + nullptr; /**< Source if from intermediate (else encode). */ + uint8_t table_id = 0; /**< Table ID for encoding/lookup. */ + bool from_build = false; /**< True if from build side, false if probe. */ + bool needs_encode = + false; /**< True if columnar (needs GlobalRowId encode). */ +}; + +/** + * @brief Builds RowIdSource for each output row ID column. + * + * @param merged_table_ids Sorted, unique table IDs to track in output. + * @param build_input Build side data (ColumnarTable* or ExtendedResult). + * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). + * @return Vector of RowIdSource, one per tracked table. 
+ */ +inline std::vector +prepare_rowid_sources(const std::vector &merged_table_ids, + const JoinInput &build_input, + const JoinInput &probe_input) { + std::vector sources; + sources.reserve(merged_table_ids.size()); + + for (uint8_t tid : merged_table_ids) { + RowIdSource src; + src.table_id = tid; + + // Check build side first + auto build_tables = build_input.tracked_tables(); + bool in_build = std::find(build_tables.begin(), build_tables.end(), + tid) != build_tables.end(); + if (in_build) { + src.from_build = true; + if (build_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + src.needs_encode = false; + src.source_col = build_input.get_rowid_column(tid); + } + } else { + // Must be from probe side + src.from_build = false; + if (probe_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + src.needs_encode = false; + src.source_col = probe_input.get_rowid_column(tid); + } + } + sources.push_back(src); + } + return sources; +} + /** * @brief Constructs intermediate results directly from thread-local buffers. * * Each thread iterates its own buffer, avoiding the merge step. Total matches * computed by summing buffer counts. Each thread writes its contiguous portion - * of output pages. + * of output pages. Also populates row ID columns for provenance tracking. * * @tparam Mode Collection mode for compile-time specialization. * @param buffers Vector of ThreadLocalMatchBuffer from probe. - * @param build_input Build side data (ColumnarTable* or ExecuteResult). - * @param probe_input Probe side data (ColumnarTable* or ExecuteResult). + * @param build_input Build side data (ColumnarTable* or ExtendedResult). + * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). * @param remapped_attrs Output column specifications (global indexing). * @param build_node PlanNode for build side output_attrs mapping. * @param probe_node PlanNode for probe side output_attrs mapping. 
* @param build_size Number of output columns from build side. * @param columnar_reader ColumnarReader with Cursor caching for page access. - * @param results Pre-initialized ExecuteResult, populated in-place. + * @param results Pre-initialized ExtendedResult, populated in-place. + * @param merged_table_ids Sorted, unique table IDs to track in output. */ template inline void construct_intermediate_from_buffers( @@ -115,7 +184,8 @@ inline void construct_intermediate_from_buffers( const JoinInput &build_input, const JoinInput &probe_input, const std::vector> &remapped_attrs, const PlanNode &build_node, const PlanNode &probe_node, size_t build_size, - ColumnarReader &columnar_reader, ExecuteResult &results) { + ColumnarReader &columnar_reader, ExtendedResult &results, + const std::vector &merged_table_ids) { // Compute total matches and per-buffer start offsets size_t total_matches = 0; @@ -130,25 +200,38 @@ inline void construct_intermediate_from_buffers( auto sources = prepare_sources(remapped_attrs, build_input, probe_input, build_node, probe_node, build_size); + auto rowid_sources = + prepare_rowid_sources(merged_table_ids, build_input, probe_input); const size_t num_threads = THREAD_COUNT; const size_t num_cols = sources.size(); + const size_t num_rowid_cols = rowid_sources.size(); - // Pre-size page vectors for each column + // Pre-size page vectors for each data column using Page = mema::column_t::Page; + using RowIdPage = mema::rowid_column_t::Page; size_t total_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; for (size_t c = 0; c < num_cols; ++c) { - auto &col = results[c]; + auto &col = results.columns[c]; col.pages.resize(total_pages_needed); col.set_row_count(total_matches); } + // Setup row ID columns in results + results.table_ids = merged_table_ids; + results.row_ids.resize(num_rowid_cols); + for (size_t r = 0; r < num_rowid_cols; ++r) { + results.row_ids[r].table_id = merged_table_ids[r]; + 
results.row_ids[r].pages.resize(total_pages_needed); + results.row_ids[r].set_row_count(total_matches); + } + // Parallel page allocation - each thread allocates its own pages worker_pool().execute([&](size_t t) { for (size_t c = 0; c < num_cols; ++c) { - auto &col = results[c]; + auto &col = results.columns[c]; for (size_t p = t; p < total_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) @@ -156,6 +239,16 @@ inline void construct_intermediate_from_buffers( col.pages[p] = reinterpret_cast(ptr); } } + // Allocate row ID pages + for (size_t r = 0; r < num_rowid_cols; ++r) { + auto &rid_col = results.row_ids[r]; + for (size_t p = t; p < total_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + rid_col.pages[p] = reinterpret_cast(ptr); + } + } }); // Parallel: each thread processes its own buffer @@ -170,9 +263,10 @@ inline void construct_intermediate_from_buffers( size_t start = buffer_starts[t]; Contest::ColumnarReader::Cursor cursor; + // Process data columns for (size_t c = 0; c < num_cols; ++c) { const auto &src = sources[c]; - auto &dest_col = results[c]; + auto &dest_col = results.columns[c]; auto left_range = buf.left_range(); auto right_range = buf.right_range(); @@ -211,6 +305,49 @@ inline void construct_intermediate_from_buffers( } } } + + // Process row ID columns + for (size_t r = 0; r < num_rowid_cols; ++r) { + const auto &rid_src = rowid_sources[r]; + auto &dest_rid_col = results.row_ids[r]; + + auto left_range = buf.left_range(); + auto right_range = buf.right_range(); + + if (rid_src.from_build) { + size_t k = start; + if (rid_src.needs_encode) { + // Columnar build: encode GlobalRowId on-the-fly + for (uint32_t local_idx : left_range) { + dest_rid_col.write_at( + k++, + GlobalRowId::encode(rid_src.table_id, local_idx)); + } + } else { + // Intermediate build: copy from source row ID column + const auto &src_col = *rid_src.source_col; + for (uint32_t local_idx : left_range) 
{ + dest_rid_col.write_at(k++, src_col[local_idx]); + } + } + } else { + size_t k = start; + if (rid_src.needs_encode) { + // Columnar probe: encode GlobalRowId on-the-fly + for (uint32_t local_idx : right_range) { + dest_rid_col.write_at( + k++, + GlobalRowId::encode(rid_src.table_id, local_idx)); + } + } else { + // Intermediate probe: copy from source row ID column + const auto &src_col = *rid_src.source_col; + for (uint32_t local_idx : right_range) { + dest_rid_col.write_at(k++, src_col[local_idx]); + } + } + } + } }); } diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index e154e93..6d4a3be 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -25,6 +25,7 @@ namespace Contest::materialize { using Contest::ExecuteResult; +using Contest::ExtendedResult; using Contest::io::ColumnarReader; using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; @@ -191,8 +192,8 @@ inline void materialize_single_column_from_buffers( auto [actual_idx, _] = node.output_attrs[local_idx]; col_source = &table->columns[actual_idx]; } else { - const auto &res = std::get(input.data); - inter_source = &res[local_idx]; + const auto &res = std::get(input.data); + inter_source = &res.columns[local_idx]; } auto reader = [&](uint32_t rid, ColumnarReader::Cursor &cursor, diff --git a/src/execute.cpp b/src/execute.cpp index c5a3eed..a9589ad 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -49,10 +49,10 @@ using materialize::create_empty_result; using materialize::materialize_from_buffers; /** - * @brief Result variant: ExecuteResult (intermediate, value_t columns) or + * @brief Result variant: ExtendedResult (intermediate, with row ID tracking) or * ColumnarTable (final output per contest API). */ -using JoinResult = std::variant; +using JoinResult = std::variant; /** * @brief Recursive join execution with timing. 
@@ -69,7 +69,7 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, * @brief Resolve plan node to JoinInput. * * ScanNode -> non-owning ColumnarTable*; JoinNode -> recursive execution - * returning owned ExecuteResult. Implements depth-first traversal. + * returning owned ExtendedResult. Implements depth-first traversal. * * @param plan Query plan. * @param node_idx Node index to resolve. @@ -87,7 +87,7 @@ JoinInput resolve_join_input(const Plan &plan, size_t node_idx, input.table_id = scan->base_table_id; } else { auto result = execute_impl(plan, node_idx, false, stats); - input.data = std::get(std::move(result)); + input.data = std::get(std::move(result)); input.table_id = 0; } return input; @@ -128,9 +128,9 @@ JoinResult execute_join_with_mode( config.probe_attr); } else { const auto &probe_result = - std::get(probe_input.data); + std::get(probe_input.data); match_buffers = probe_intermediate( - *hash_table, probe_result[config.probe_attr]); + *hash_table, probe_result.columns[config.probe_attr]); } auto probe_end = std::chrono::high_resolution_clock::now(); stats.hash_join_probe_ms += @@ -175,7 +175,7 @@ JoinResult execute_join_with_mode( construct_intermediate_from_buffers( match_buffers, build_input, probe_input, config.remapped_attrs, build_node, probe_node, build_input.output_size(), - columnar_reader, setup.results); + columnar_reader, setup.results, setup.merged_table_ids); } auto inter_end = std::chrono::high_resolution_clock::now(); stats.intermediate_ms += @@ -203,7 +203,7 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, auto &node = plan.nodes[node_idx]; if (!std::holds_alternative(node.data)) { - return ExecuteResult{}; + return ExtendedResult{}; } const auto &join = std::get(node.data); @@ -288,7 +288,7 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, } // Should never reach here, but satisfy compiler - return ExecuteResult{}; + return ExtendedResult{}; } /** From 
d1a6bfcd14de66aa3d29213f56f8b3a40bb9b83e Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Wed, 21 Jan 2026 23:51:01 +0200 Subject: [PATCH 02/13] feat: initial deferred materialization --- .gitignore | 3 + CMakeLists.txt | 7 + include/data_access/columnar_reader.h | 124 ++++- include/data_model/deferred_intermediate.h | 169 +++++++ include/data_model/deferred_plan.h | 142 ++++++ include/data_model/plan.h | 21 +- include/materialization/construct_deferred.h | 446 ++++++++++++++++++ .../materialization/materialize_deferred.h | 439 +++++++++++++++++ src/analyze_plan.cpp | 311 ++++++++++++ src/execute.cpp | 386 ++++++++++++++- 10 files changed, 2029 insertions(+), 19 deletions(-) create mode 100644 include/data_model/deferred_intermediate.h create mode 100644 include/data_model/deferred_plan.h create mode 100644 include/materialization/construct_deferred.h create mode 100644 include/materialization/materialize_deferred.h create mode 100644 src/analyze_plan.cpp diff --git a/.gitignore b/.gitignore index 12c22e9..f1c8719 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,6 @@ compile_commands.json /env/ script.py *.md +/build_deferred +/build_debug +/build_eager diff --git a/CMakeLists.txt b/CMakeLists.txt index dc0739a..2621d56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,13 @@ FetchContent_MakeAvailable(fmtlib) set(ENABLE_SANITIZER OFF) set(ENABLE_UBSAN OFF) + +# Deferred materialization: only materialize join keys, defer other columns +option(USE_DEFERRED_MATERIALIZATION "Enable deferred column materialization" OFF) +if(USE_DEFERRED_MATERIALIZATION) + message(STATUS "Deferred materialization ENABLED") + add_compile_definitions(USE_DEFERRED_MATERIALIZATION) +endif() if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|powerpc|ppc64|ppc64le") message("Disabling jemalloc extension of DuckDB on Power.") set(SKIP_EXTENSIONS jemalloc) diff --git a/include/data_access/columnar_reader.h b/include/data_access/columnar_reader.h index 2074498..e143c95 100644 --- 
a/include/data_access/columnar_reader.h +++ b/include/data_access/columnar_reader.h @@ -275,10 +275,15 @@ class ColumnarReader { /* Dense INT32 fast path: O(1) arithmetic lookup, bypasses cursor */ if (data_type == DataType::INT32) { - const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] + size_t pidx_size = + IsBuild ? build_page_indices.size() : probe_page_indices.size(); + if (SPC_LIKELY(col_idx < pidx_size)) { + const PageIndex &page_index = IsBuild + ? build_page_indices[col_idx] : probe_page_indices[col_idx]; - if (SPC_LIKELY(page_index.is_dense_int32)) { - return mema::value_t{read_dense_int32(page_index, row_id)}; + if (SPC_LIKELY(page_index.is_dense_int32)) { + return mema::value_t{read_dense_int32(page_index, row_id)}; + } } } @@ -291,10 +296,10 @@ class ColumnarReader { global_probe_version.load(std::memory_order_relaxed); } - if (SPC_LIKELY(cursor.version == current_version && - col_idx == cursor.cached_col && - row_id >= cursor.cached_start && - row_id < cursor.cached_end)) { + bool cache_hit = + cursor.version == current_version && col_idx == cursor.cached_col && + row_id >= cursor.cached_start && row_id < cursor.cached_end; + if (SPC_LIKELY(cache_hit)) { uint32_t local_row = row_id - cursor.cached_start; if (SPC_LIKELY(cursor.is_dense)) { if (data_type == DataType::INT32) { @@ -313,9 +318,11 @@ class ColumnarReader { } /* sequential access optimization: skip binary search for next page */ + size_t pidx_count = + IsBuild ? build_page_indices.size() : probe_page_indices.size(); if (SPC_LIKELY(cursor.version == current_version && col_idx == cursor.cached_col && - row_id == cursor.cached_end)) { + row_id == cursor.cached_end && col_idx < pidx_count)) { const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] : probe_page_indices[col_idx]; size_t next_page = cursor.cached_page + 1; @@ -384,6 +391,12 @@ class ColumnarReader { Cursor &cursor, uint64_t current_version) const { + size_t pidx_size = + IsBuild ? 
build_page_indices.size() : probe_page_indices.size(); + if (SPC_UNLIKELY(col_idx >= pidx_size)) { + // No page index prepared - use direct page read + return read_value_direct(column, row_id, data_type); + } const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] : probe_page_indices[col_idx]; size_t page_num = page_index.find_page(row_id); @@ -407,6 +420,18 @@ class ColumnarReader { } } + /** + * @brief Direct value read bypassing page index cache. + * + * Used for deferred column resolution when reading from base tables + * that don't have prepared page indices. O(n) page scan per read. + */ + inline mema::value_t read_value_direct_public(const Column &column, + uint32_t row_id, + DataType data_type) const { + return read_value_direct(column, row_id, data_type); + } + inline const PageIndex &get_build_page_index(size_t col_idx) const { return build_page_indices[col_idx]; } @@ -428,6 +453,89 @@ class ColumnarReader { return reinterpret_cast(page_data + 4)[local_row]; } + /** + * @brief Direct value read without prepared page index. + * + * Used when page indices aren't available (e.g., reading base tables + * during deferred resolution). O(n) page scan - slower than cached path. 
+ */ + inline mema::value_t read_value_direct(const Column &column, + uint32_t row_id, + DataType data_type) const { + // Linear scan to find page containing row_id + uint32_t cumulative = 0; + for (size_t page_num = 0; page_num < column.pages.size(); ++page_num) { + auto *page_data = column.pages[page_num]->data; + auto num_rows = *reinterpret_cast(page_data); + auto num_values = + *reinterpret_cast(page_data + 2); + + // Handle special pages + if (num_rows == 0xffff) { + // Long string page - single row + if (row_id == cumulative) { + return mema::value_t::encode_string( + static_cast(page_num), + mema::value_t::LONG_STRING_OFFSET); + } + cumulative += 1; + continue; + } + if (num_rows == 0xfffe) { + // Skip special marker pages + continue; + } + + if (row_id < cumulative + num_rows) { + // Found the page + uint32_t local_row = row_id - cumulative; + bool is_dense = (num_rows == num_values); + const auto *data_ptr = + reinterpret_cast(page_data + 4); + + if (is_dense) { + if (data_type == DataType::INT32) { + return mema::value_t{data_ptr[local_row]}; + } else { + return mema::value_t::encode_string( + static_cast(page_num), + static_cast(local_row)); + } + } else { + // Sparse page - check bitmap + size_t bitmap_size = (num_rows + 7) / 8; + const auto *bitmap_ptr = reinterpret_cast( + page_data + PAGE_SIZE - bitmap_size); + + bool is_valid = + bitmap_ptr[local_row >> 3] & (1u << (local_row & 7)); + if (!is_valid) { + return mema::value_t{mema::value_t::NULL_VALUE}; + } + + // Compute data index via popcount + uint32_t data_idx = 0; + for (uint32_t i = 0; i < local_row; ++i) { + if (bitmap_ptr[i >> 3] & (1u << (i & 7))) { + data_idx++; + } + } + + if (data_type == DataType::INT32) { + return mema::value_t{data_ptr[data_idx]}; + } else { + return mema::value_t::encode_string( + static_cast(page_num), + static_cast(data_idx)); + } + } + } + cumulative += num_rows; + } + // Row not found - return NULL + return mema::value_t{mema::value_t::NULL_VALUE}; + } + /** 
@brief Reads from sparse pages using bitmap and popcount. */ inline mema::value_t read_sparse(uint32_t local_row, DataType data_type, const Cursor &cursor) const { diff --git a/include/data_model/deferred_intermediate.h b/include/data_model/deferred_intermediate.h new file mode 100644 index 0000000..0c16a13 --- /dev/null +++ b/include/data_model/deferred_intermediate.h @@ -0,0 +1,169 @@ +/** + * @file deferred_intermediate.h + * @brief Lightweight intermediate result for deferred materialization. + * + * DeferredResult stores only materialized columns (join keys) plus row ID + * provenance columns. Deferred columns are resolved at final materialization + * by following row IDs back to base tables. + * + * @see deferred_plan.h for DeferredJoinNode with column decisions. + * @see construct_deferred.h for building DeferredResult. + * @see materialize_deferred.h for final resolution. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace Contest { + +/** + * @brief Lightweight intermediate result with only join keys materialized. + * + * Unlike ExtendedResult which stores all projected columns, DeferredResult + * stores only columns marked MATERIALIZE (typically just the parent's join + * key). All other columns are resolved at final materialization using row ID + * provenance. + * + * Memory savings: For a join projecting N columns where only 1 is a join key, + * DeferredResult uses ~1/N the memory of ExtendedResult for data columns. + * + * @see DeferredColumnInfo for materialization decisions. + * @see DeferredJoinNode for column provenance tracking. + */ +struct DeferredResult { + /// Only columns marked MATERIALIZE (typically 1 join key). + std::vector materialized; + + /// Map: original column index → index in materialized (nullopt if + /// deferred). + std::vector> materialized_map; + + /// Row ID tracking for provenance (same as ExtendedResult). + std::vector row_ids; + + /// Which base tables are tracked (sorted). 
+ std::vector table_ids; + + /// Reference to node info for column provenance resolution. + const DeferredJoinNode *node_info = nullptr; + + /// Total row count. + size_t num_rows = 0; + + DeferredResult() = default; + DeferredResult(DeferredResult &&) = default; + DeferredResult &operator=(DeferredResult &&) = default; + DeferredResult(const DeferredResult &) = delete; + DeferredResult &operator=(const DeferredResult &) = delete; + + /** @brief Total row count. */ + size_t row_count() const { return num_rows; } + + /** @brief Check if column was materialized (not deferred). */ + bool is_materialized(size_t orig_idx) const { + return orig_idx < materialized_map.size() && + materialized_map[orig_idx].has_value(); + } + + /** @brief Get materialized column, or nullptr if deferred. */ + const mema::column_t *get_materialized(size_t orig_idx) const { + if (!is_materialized(orig_idx)) + return nullptr; + return &materialized[*materialized_map[orig_idx]]; + } + + /** @brief Find row ID column index for a table, or -1 if not found. */ + int find_rowid_index(uint8_t tid) const { + for (size_t i = 0; i < table_ids.size(); ++i) { + if (table_ids[i] == tid) + return static_cast(i); + } + return -1; + } + + /** @brief Get row ID column for a table, or nullptr if not found. */ + const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + int idx = find_rowid_index(tid); + return (idx >= 0) ? &row_ids[idx] : nullptr; + } + + /** @brief Get mutable row ID column for a table, or nullptr. */ + mema::rowid_column_t *get_rowid_column_mut(uint8_t tid) { + int idx = find_rowid_index(tid); + return (idx >= 0) ? &row_ids[idx] : nullptr; + } +}; + +/** + * @brief Input abstraction for deferred execution path. + * + * Similar to JoinInput but works with DeferredResult instead of ExtendedResult. + * Provides uniform interface for columnar (base table) and deferred + * intermediate data sources. + */ +struct DeferredInput { + /// Either base table pointer or owned DeferredResult. 
+ std::variant data; + + /// Original plan node for output_attrs mapping. + const PlanNode *node = nullptr; + + /// Deferred plan node for materialization decisions. + const DeferredNode *deferred_node = nullptr; + + /// Base table ID (for columnar inputs). + uint8_t table_id = 0; + + /** @brief True if data is columnar (base table). */ + bool is_columnar() const { + return std::holds_alternative(data); + } + + /** @brief Row count for join key column. */ + size_t row_count(size_t col_idx) const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; + } + return std::get(data).row_count(); + } + + /** @brief Total row count. */ + size_t row_count() const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; + } + return std::get(data).row_count(); + } + + /** @brief Number of output columns. */ + size_t output_size() const { + if (node) + return node->output_attrs.size(); + return 0; + } + + /** @brief Get list of tracked table IDs. */ + std::vector tracked_tables() const { + if (is_columnar()) { + return {table_id}; + } + return std::get(data).table_ids; + } + + /** @brief Get row ID column for a table. */ + const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_rowid_column(tid); + } +}; + +} // namespace Contest diff --git a/include/data_model/deferred_plan.h b/include/data_model/deferred_plan.h new file mode 100644 index 0000000..46daa23 --- /dev/null +++ b/include/data_model/deferred_plan.h @@ -0,0 +1,142 @@ +/** + * @file deferred_plan.h + * @brief Analyzed plan with materialization decisions for deferred execution. + * + * DeferredPlan mirrors the original Plan structure but includes pre-computed + * decisions about which columns to materialize eagerly (join keys) vs defer + * until final output. Each DeferredJoinNode tracks column provenance back to + * base tables for efficient deferred resolution. 
+ * + * @see analyze_plan.cpp for the analysis algorithm. + * @see deferred_intermediate.h for the runtime result format. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace Contest { + +/** + * @brief Materialization decision for an output column. + * + * MATERIALIZE: Column is needed as a join key by parent - materialize eagerly. + * DEFER: Column only needed at final output - defer until root materialization. + */ +enum class ColumnResolution : uint8_t { MATERIALIZE, DEFER }; + +/** + * @brief Tracks the base table origin of a column for deferred resolution. + * + * Used to resolve deferred columns at final materialization by looking up + * the original value in the base table using row ID provenance. + */ +struct ColumnProvenance { + uint8_t base_table_id; ///< Index into Plan::inputs. + uint8_t base_column_idx; ///< Column index within the base table. +}; + +/** + * @brief Complete metadata for an output column in a deferred join. + * + * Combines materialization decision, provenance tracking, and child source + * information for efficient intermediate construction and final resolution. + */ +struct DeferredColumnInfo { + size_t original_idx; ///< Index in node's output_attrs. + DataType type; ///< INT32 or VARCHAR. + + ColumnResolution resolution; ///< MATERIALIZE or DEFER. + ColumnProvenance provenance; ///< Base table source for deferred resolution. + + bool from_left; ///< True if from left child, false if right. + size_t child_output_idx; ///< Index in child's output_attrs. +}; + +/** + * @brief Analyzed scan node for deferred execution. + * + * Wraps a ScanNode with output attribute information. + */ +struct DeferredScanNode { + size_t node_idx; ///< Index in original Plan::nodes. + uint8_t base_table_id; ///< Index into Plan::inputs. + std::vector> output_attrs; ///< Projected cols. +}; + +/** + * @brief Analyzed join node with pre-computed materialization decisions. 
+ * + * Contains all information needed for deferred execution: + * - Which columns to materialize eagerly (join keys for parent) + * - Column provenance for deferred resolution + * - Pre-computed match collection mode + * - Table IDs tracked through this node + */ +struct DeferredJoinNode { + size_t node_idx; ///< Index in original Plan::nodes. + + size_t left_child_idx; ///< Left child index in Plan::nodes. + size_t right_child_idx; ///< Right child index in Plan::nodes. + size_t left_join_attr; ///< Join key index in left child's output. + size_t right_join_attr; ///< Join key index in right child's output. + + /// Original output attributes (global indexing). + std::vector> output_attrs; + + /// Per-column materialization decisions and provenance. + std::vector columns; + + /// Pre-computed collection mode (assumes build=left; flip if build=right). + join::MatchCollectionMode base_collection_mode; + + /// Sorted table IDs tracked through this node (union of children). + std::vector tracked_table_ids; + + /// Column index that parent needs as join key (nullopt if root). + std::optional parent_join_key_idx; + + /// True if this is the root node. + bool is_root; +}; + +/** + * @brief Plan node variant for deferred execution. + */ +using DeferredNode = std::variant; + +/** + * @brief Analyzed plan with materialization decisions. + * + * Mirrors Plan structure but includes pre-computed decisions for deferred + * materialization. The original_plan pointer provides access to base tables + * for value resolution. + */ +struct DeferredPlan { + std::vector nodes; ///< Analyzed nodes (same indices as Plan). + size_t root; ///< Root node index. + const Plan *original_plan; ///< Non-owning reference to original plan. + + const DeferredNode &operator[](size_t idx) const { return nodes[idx]; } +}; + +/** + * @brief Analyze plan and compute materialization decisions. + * + * Walks the plan tree in post-order, determining for each join node: + * 1. 
Which column the parent needs as join key (MATERIALIZE) + * 2. All other columns (DEFER) + * 3. Provenance for each column back to base table + * 4. Pre-computed collection mode based on output columns + * + * @param plan Original query plan. + * @return DeferredPlan with materialization decisions. + */ +DeferredPlan analyze_plan(const Plan &plan); + +} // namespace Contest diff --git a/include/data_model/plan.h b/include/data_model/plan.h index 99c623e..897a8e2 100644 --- a/include/data_model/plan.h +++ b/include/data_model/plan.h @@ -33,7 +33,8 @@ #endif /** - * @brief RAII mmap wrapper with refcount. munmap on last ref release. Move-only. + * @brief RAII mmap wrapper with refcount. munmap on last ref release. + * Move-only. */ class MappedMemory { public: @@ -127,8 +128,8 @@ constexpr size_t PAGE_SIZE = 8192; * @brief 8-byte aligned page (8KB) for columnar data. * * INT32: [num_rows:u16][num_values:u16][values...][bitmap at end] - * VARCHAR: [num_rows:u16][num_offsets:u16][offsets:u16...][string bytes][bitmap] - * Long string markers: 0xFFFF (first), 0xFFFE (continuation). + * VARCHAR: [num_rows:u16][num_offsets:u16][offsets:u16...][string + * bytes][bitmap] Long string markers: 0xFFFF (first), 0xFFFE (continuation). * Dense page (no NULLs): num_rows == num_values → fast path. */ struct alignas(8) Page { @@ -219,7 +220,8 @@ struct Plan { size_t root; /**< Index of root node in nodes. */ /** - * @brief Create JoinNode. @return node index. Execution may override build_left. + * @brief Create JoinNode. @return node index. Execution may override + * build_left. */ size_t new_join_node(bool build_left, size_t left, size_t right, size_t left_attr, @@ -282,7 +284,8 @@ template struct ColumnInserter { bitmap.resize(PAGE_SIZE); } - /** @brief Get current page, allocating if needed. Does not advance index. */ + /** @brief Get current page, allocating if needed. Does not advance index. 
+ */ std::byte *get_page() { if (last_page_idx == column.pages.size()) [[unlikely]] { column.new_page(); @@ -369,7 +372,8 @@ template <> struct ColumnInserter { bitmap.resize(PAGE_SIZE); } - /** @brief Get current page, allocating if needed. Does not advance index. */ + /** @brief Get current page, allocating if needed. Does not advance index. + */ std::byte *get_page() { if (last_page_idx == column.pages.size()) [[unlikely]] { column.new_page(); @@ -378,7 +382,8 @@ template <> struct ColumnInserter { return page->data; } - /** @brief Write long string (>PAGE_SIZE-7) across pages. 0xFFFF/0xFFFE markers. */ + /** @brief Write long string (>PAGE_SIZE-7) across pages. 0xFFFF/0xFFFE + * markers. */ void save_long_string(std::string_view value) { size_t offset = 0; auto first_page = true; @@ -484,6 +489,8 @@ struct TimingStats { int64_t setup_ms = 0; /**< JoinSetup + build/probe selection. */ int64_t total_execution_ms = 0; /**< Wall-clock total for execute(). */ int64_t intermediate_ms = 0; /**< construct_intermediate for non-root. */ + int64_t analyze_plan_ms = 0; /**< Deferred: plan analysis time. */ + int64_t deferred_resolve_ms = 0; /**< Deferred: column resolution time. */ }; /** @brief Allocate execution context (worker pool, shared state). */ diff --git a/include/materialization/construct_deferred.h b/include/materialization/construct_deferred.h new file mode 100644 index 0000000..5ba8b3e --- /dev/null +++ b/include/materialization/construct_deferred.h @@ -0,0 +1,446 @@ +/** + * @file construct_deferred.h + * @brief Constructs deferred intermediate results for multi-way joins. + * + * Allocates and populates DeferredResult with only MATERIALIZE columns + * (typically just the parent's join key). Row ID columns are always + * populated for provenance tracking. + * + * @see construct_intermediate.h for the eager materialization equivalent. + * @see materialize_deferred.h for final resolution of deferred columns. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace Contest { +namespace materialize { + +using Contest::io::ColumnarReader; +using Contest::join::MatchCollectionMode; +using Contest::join::ThreadLocalMatchBuffer; +using Contest::platform::THREAD_COUNT; +using Contest::platform::worker_pool; + +/** + * @brief Collect columns needed from a DeferredInput for page index building. + */ +inline platform::ArenaVector +collect_deferred_columns(const DeferredInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for deferred materialization path. + * + * Sets up page indices for columns that need to be read from columnar inputs. 
+ */ +inline void prepare_deferred_columns( + ColumnarReader &reader, const DeferredInput &build_input, + const DeferredInput &probe_input, const DeferredJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // Determine which columns from each side are needed + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark columns needed based on materialization decisions + // from_left refers to original left child + // build_is_left tells us if build side is the left child + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && + col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_deferred_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_deferred_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty deferred result with proper schema. + * + * Used when total_matches == 0. Creates empty materialized columns + * for columns marked MATERIALIZE so they can be used in subsequent joins. 
+ */ +inline DeferredResult +create_empty_deferred_result(const DeferredJoinNode &node) { + DeferredResult result; + result.node_info = &node; + result.num_rows = 0; + result.materialized_map.resize(node.columns.size(), std::nullopt); + result.table_ids = node.tracked_table_ids; + + // Count and allocate empty materialized columns + size_t mat_count = 0; + for (const auto &col : node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + result.materialized_map[col.original_idx] = mat_count++; + } + } + result.materialized.resize(mat_count); + // Each column has 0 rows, which is valid for empty result + + // Also create empty row ID columns + result.row_ids.resize(node.tracked_table_ids.size()); + for (size_t i = 0; i < node.tracked_table_ids.size(); ++i) { + result.row_ids[i].table_id = node.tracked_table_ids[i]; + } + + return result; +} + +/** + * @brief Precomputed metadata for row ID column sources. + * + * Mirrors RowIdSource from construct_intermediate.h but adapted for + * DeferredInput. + */ +struct DeferredRowIdSource { + const mema::rowid_column_t *source_col = + nullptr; ///< Source if from intermediate. + uint8_t table_id = 0; ///< Table ID for encoding. + bool from_build = false; ///< True if from build side. + bool needs_encode = false; ///< True if columnar (needs GlobalRowId encode). +}; + +/** + * @brief Prepare row ID sources for deferred intermediate construction. 
+ */ +inline std::vector +prepare_deferred_rowid_sources(const std::vector &merged_table_ids, + const DeferredInput &build_input, + const DeferredInput &probe_input) { + std::vector sources; + sources.reserve(merged_table_ids.size()); + + for (uint8_t tid : merged_table_ids) { + DeferredRowIdSource src; + src.table_id = tid; + + // Check build side first + auto build_tables = build_input.tracked_tables(); + bool in_build = std::find(build_tables.begin(), build_tables.end(), + tid) != build_tables.end(); + if (in_build) { + src.from_build = true; + if (build_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + src.needs_encode = false; + src.source_col = build_input.get_rowid_column(tid); + } + } else { + // Must be from probe side + src.from_build = false; + if (probe_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + src.needs_encode = false; + src.source_col = probe_input.get_rowid_column(tid); + } + } + sources.push_back(src); + } + return sources; +} + +/** + * @brief Constructs deferred intermediate result from thread-local buffers. + * + * Only materializes columns marked MATERIALIZE in the DeferredJoinNode. + * All row ID columns are populated for provenance tracking. + * + * @tparam Mode Collection mode for compile-time specialization. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side data source. + * @param probe_input Probe side data source. + * @param join_node Deferred join node with materialization decisions. + * @param remapped_attrs Output attributes (after build/probe remapping). + * @param build_output_size Number of columns from build side. + * @param columnar_reader Reader for columnar data access. + * @param out_result Output DeferredResult (populated in-place). + * @param merged_table_ids Sorted table IDs to track. + * @param deferred_plan Full deferred plan for base table access (deferred + * resolution). 
+ */ +template +void construct_deferred_from_buffers( + std::vector> &buffers, + const DeferredInput &build_input, const DeferredInput &probe_input, + const DeferredJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_output_size, bool build_is_left, + ColumnarReader &columnar_reader, DeferredResult &out_result, + const std::vector &merged_table_ids, + const DeferredPlan &deferred_plan) { + + // Count total matches + size_t total_matches = 0; + std::vector buffer_starts(buffers.size()); + for (size_t i = 0; i < buffers.size(); ++i) { + buffer_starts[i] = total_matches; + total_matches += buffers[i].count(); + } + + if (total_matches == 0) { + out_result = create_empty_deferred_result(join_node); + return; + } + + out_result.node_info = &join_node; + out_result.num_rows = total_matches; + out_result.table_ids = merged_table_ids; + + // Build materialized_map: count MATERIALIZE columns and create mapping + // materialized_map[original_idx] -> index into out_result.materialized + out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); + size_t mat_count = 0; + + // Iterate over join_node.columns (which uses original output order) + // and assign materialized indices to MATERIALIZE columns + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + out_result.materialized_map[col.original_idx] = mat_count++; + } + } + + // Prepare row ID sources + auto rowid_sources = prepare_deferred_rowid_sources( + merged_table_ids, build_input, probe_input); + + const size_t num_rowid_cols = rowid_sources.size(); + + // Pre-allocate pages + using Page = mema::column_t::Page; + using RowIdPage = mema::rowid_column_t::Page; + size_t total_pages_needed = + (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + + // Allocate materialized columns + out_result.materialized.resize(mat_count); + for (size_t c = 0; c < mat_count; ++c) { + 
out_result.materialized[c].pages.resize(total_pages_needed); + out_result.materialized[c].set_row_count(total_matches); + } + + // Allocate row ID columns + out_result.row_ids.resize(num_rowid_cols); + for (size_t r = 0; r < num_rowid_cols; ++r) { + out_result.row_ids[r].table_id = merged_table_ids[r]; + out_result.row_ids[r].pages.resize(total_pages_needed); + out_result.row_ids[r].set_row_count(total_matches); + } + + // Parallel page allocation + const size_t num_threads = THREAD_COUNT; + worker_pool().execute([&](size_t t) { + for (size_t c = 0; c < mat_count; ++c) { + auto &col = out_result.materialized[c]; + for (size_t p = t; p < total_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + col.pages[p] = reinterpret_cast(ptr); + } + } + for (size_t r = 0; r < num_rowid_cols; ++r) { + auto &rid_col = out_result.row_ids[r]; + for (size_t p = t; p < total_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + rid_col.pages[p] = reinterpret_cast(ptr); + } + } + }); + + // Set source metadata for materialized columns + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + size_t mat_idx = *out_result.materialized_map[col.original_idx]; + out_result.materialized[mat_idx].source_table = + col.provenance.base_table_id; + out_result.materialized[mat_idx].source_column = + col.provenance.base_column_idx; + } + } + + // Parallel population: each thread processes its own buffer + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) + return; + + size_t start = buffer_starts[t]; + ColumnarReader::Cursor cursor; + + // Iterate through matches + auto left_it = buf.left_range().begin(); + auto right_it = buf.right_range().begin(); + + for (size_t m = 0; m < my_count; ++m) { + uint32_t build_row = 0, probe_row = 0; + + if constexpr 
(Mode == MatchCollectionMode::BOTH) { + build_row = *left_it; + probe_row = *right_it; + ++left_it; + ++right_it; + } else if constexpr (Mode == MatchCollectionMode::LEFT_ONLY) { + build_row = *left_it; + ++left_it; + } else { + probe_row = *right_it; + ++right_it; + } + + size_t out_row = start + m; + + // Write materialized columns + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + size_t mat_col_idx = + *out_result.materialized_map[col.original_idx]; + auto &out_col = out_result.materialized[mat_col_idx]; + + // Determine source based on from_left and build/probe mapping + // col.from_left refers to original left child + // build_is_left tells us if build side is the left child + // If from_left && build_is_left => from build + // If from_left && !build_is_left => from probe (left became + // probe) + bool from_build = (col.from_left == build_is_left); + uint32_t src_row = from_build ? build_row : probe_row; + const auto &src_input = from_build ? build_input : probe_input; + + mema::value_t val; + if (src_input.is_columnar()) { + const auto *table = + std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col.child_output_idx]; + val = columnar_reader.read_value( + table->columns[actual_idx], col.child_output_idx, + src_row, col.type, cursor, from_build); + } else { + const auto &ir = std::get(src_input.data); + // Check if materialized in child + const auto *src_col = + ir.get_materialized(col.child_output_idx); + if (src_col) { + val = (*src_col)[src_row]; + } else { + // Deferred - resolve via row ID to base table + // This should only happen if materialization wasn't + // propagated properly. Use direct read as fallback. 
+ const auto *rowid_col = + ir.get_rowid_column(col.provenance.base_table_id); + if (rowid_col && deferred_plan.original_plan) { + uint32_t encoded = (*rowid_col)[src_row]; + uint32_t base_row = GlobalRowId::row(encoded); + const auto &base_table = + deferred_plan.original_plan + ->inputs[col.provenance.base_table_id]; + val = columnar_reader.read_value_direct_public( + base_table + .columns[col.provenance.base_column_idx], + base_row, col.type); + } else { + val = mema::value_t{mema::value_t::NULL_VALUE}; + } + } + } + + out_col.write_at(out_row, val); + } + + // Write row ID columns + for (size_t r = 0; r < num_rowid_cols; ++r) { + const auto &rid_src = rowid_sources[r]; + auto &dest_rid_col = out_result.row_ids[r]; + + uint32_t local_idx = rid_src.from_build ? build_row : probe_row; + + if (rid_src.needs_encode) { + dest_rid_col.write_at( + out_row, + GlobalRowId::encode(rid_src.table_id, local_idx)); + } else if (rid_src.source_col) { + dest_rid_col.write_at(out_row, + (*rid_src.source_col)[local_idx]); + } + } + } + }); +} + +} // namespace materialize +} // namespace Contest diff --git a/include/materialization/materialize_deferred.h b/include/materialization/materialize_deferred.h new file mode 100644 index 0000000..8b548c0 --- /dev/null +++ b/include/materialization/materialize_deferred.h @@ -0,0 +1,439 @@ +/** + * @file materialize_deferred.h + * @brief Final materialization for deferred execution path. + * + * Materializes all output columns at the root join, resolving deferred + * columns by following row ID provenance back to base tables. + * + * @see construct_deferred.h for building DeferredResult intermediates. + * @see materialize.h for the eager materialization equivalent. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Contest { +namespace materialize { + +using Contest::io::ColumnarReader; +using Contest::join::MatchCollectionMode; +using Contest::join::ThreadLocalMatchBuffer; +using Contest::platform::THREAD_COUNT; +using Contest::platform::worker_pool; + +/** + * @brief Collect columns needed from a DeferredInput for final materialization. + */ +inline platform::ArenaVector +collect_final_columns(const DeferredInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for final deferred materialization at root. + * + * Sets up page indices for ALL output columns (since all need materialization + * at root). 
+ */ +inline void prepare_final_deferred_columns( + ColumnarReader &reader, const DeferredInput &build_input, + const DeferredInput &probe_input, const DeferredJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // All output columns needed at root + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark ALL columns needed for final materialization + // from_left refers to original left child + // build_is_left tells us if build side is the left child + for (const auto &col : join_node.columns) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_final_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_final_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty result for zero-match case in deferred path. 
+ */ +inline ColumnarTable create_empty_deferred_final( + const std::vector> &output_attrs) { + ColumnarTable empty_result; + empty_result.num_rows = 0; + for (auto [_, data_type] : output_attrs) { + empty_result.columns.emplace_back(data_type); + } + return empty_result; +} + +/** + * @brief Materialize a single column from deferred sources. + * + * Handles three cases: + * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index + * 2. MATERIALIZED: Column was materialized in DeferredResult + * 3. DEFERRED: Resolve via row ID lookup to base table + * + * @tparam Mode Collection mode for compile-time specialization. + * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. + * @tparam ReaderFunc Callable: (row_idx, cursor) -> value_t. + * @tparam InitBuilderFunc Callable: (page_allocator) -> BuilderType. + */ +template +inline void materialize_deferred_column( + Column &dest_col, std::vector> &buffers, + size_t total_matches, ReaderFunc &&read_value, + InitBuilderFunc &&init_builder, bool from_build, size_t est_bytes_per_row) { + + if (total_matches == 0) + return; + + const int num_threads = THREAD_COUNT; + + size_t matches_per_thread = (total_matches + num_threads - 1) / num_threads; + size_t usable_per_page = PAGE_SIZE - 256; + size_t rows_per_page = std::max(1ul, usable_per_page / est_bytes_per_row); + size_t pages_per_thread = + (matches_per_thread + rows_per_page - 1) / rows_per_page + 10; + size_t total_pages = pages_per_thread * num_threads; + + void *page_memory = + mmap(nullptr, total_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (page_memory == MAP_FAILED) + throw std::bad_alloc(); + + std::vector thread_columns; + thread_columns.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + thread_columns.emplace_back(dest_col.type); + } + + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) 
+ return; + + Column &local_col = thread_columns[t]; + + size_t thread_page_start = t * pages_per_thread; + size_t thread_page_limit = pages_per_thread; + size_t used_pages = 0; + + ColumnarReader::Cursor cursor; + + auto page_allocator = [&]() -> Page * { + Page *p; + if (used_pages < thread_page_limit) { + p = reinterpret_cast(static_cast(page_memory) + + (thread_page_start + used_pages) * + PAGE_SIZE); + used_pages++; + } else { + p = new Page(); + } + local_col.pages.push_back(p); + return p; + }; + + BuilderType builder = init_builder(page_allocator); + builder.prepare(my_count); + + const size_t check_interval = BuilderType::MIN_ROWS_PER_PAGE_CHECK; + size_t rows_since_check = 0; + + auto range = from_build ? buf.left_range() : buf.right_range(); + + for (uint32_t row_id : range) { + bool flushed = builder.add(read_value(row_id, cursor)); + + if (flushed) { + rows_since_check = 0; + } else { + rows_since_check++; + if (rows_since_check >= check_interval) { + if (builder.should_check_overflow()) { + builder.save_to_page(builder.current_page); + rows_since_check = 0; + } + if (rows_since_check > check_interval * 2) + rows_since_check = 0; + } + } + } + + if (builder.num_rows != 0) { + builder.save_to_page(builder.current_page); + } + }); + + for (auto &thread_col : thread_columns) { + for (auto *page : thread_col.pages) { + dest_col.pages.push_back(page); + } + thread_col.pages.clear(); + } + + auto *mapped_mem = new MappedMemory(page_memory, total_pages * PAGE_SIZE); + dest_col.assign_mapped_memory(mapped_mem); +} + +/** + * @brief Materialize single output column handling deferred resolution. + * + * For deferred columns, resolves via row ID tracking back to base table. + * + * @tparam Mode Collection mode for compile-time specialization. 
+ */ +template +inline void materialize_single_deferred_column( + Column &dest_col, size_t col_idx, size_t build_size, bool build_is_left, + std::vector> &buffers, size_t total_matches, + const DeferredInput &build_input, const DeferredInput &probe_input, + const DeferredJoinNode &join_node, ColumnarReader &columnar_reader, + const DeferredPlan &deferred_plan) { + + // Find column info + const DeferredColumnInfo *col_info = nullptr; + for (const auto &col : join_node.columns) { + if (col.original_idx == col_idx) { + col_info = &col; + break; + } + } + + if (!col_info) { + // Fallback - shouldn't happen + return; + } + + // Determine if this column comes from build or probe side at runtime + // col_info->from_left refers to original left child + // build_is_left tells us if build side is the left child + // If from_left && build_is_left => from build + // If from_left && !build_is_left => from probe (left became probe) + // If !from_left && build_is_left => from probe (right is probe) + // If !from_left && !build_is_left => from build (right became build) + bool from_build = (col_info->from_left == build_is_left); + const DeferredInput &src_input = from_build ? 
build_input : probe_input; + + // Determine how to read the value + const Column *columnar_source = nullptr; + const mema::column_t *materialized_source = nullptr; + const mema::rowid_column_t *rowid_source = nullptr; + const Column *base_table_column = nullptr; + + if (src_input.is_columnar()) { + // Direct columnar read + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col_info->child_output_idx]; + columnar_source = &table->columns[actual_idx]; + } else { + const auto &ir = std::get(src_input.data); + if (ir.is_materialized(col_info->child_output_idx)) { + // Read from materialized column + materialized_source = + ir.get_materialized(col_info->child_output_idx); + } else { + // Deferred - need to resolve via row ID + rowid_source = + ir.get_rowid_column(col_info->provenance.base_table_id); + if (deferred_plan.original_plan) { + base_table_column = + &deferred_plan.original_plan + ->inputs[col_info->provenance.base_table_id] + .columns[col_info->provenance.base_column_idx]; + } + } + } + + // Create reader lambda + auto reader = [&](uint32_t local_row_id, + ColumnarReader::Cursor &cursor) -> mema::value_t { + if (columnar_source) { + return columnar_reader.read_value( + *columnar_source, col_info->child_output_idx, local_row_id, + col_info->type, cursor, from_build); + } else if (materialized_source) { + return (*materialized_source)[local_row_id]; + } else if (rowid_source && base_table_column) { + // Deferred resolution: get base table row from encoded row ID + uint32_t encoded = (*rowid_source)[local_row_id]; + uint32_t base_row = GlobalRowId::row(encoded); + return columnar_reader.read_value( + *base_table_column, col_info->provenance.base_column_idx, + base_row, col_info->type, cursor, true); + } + return mema::value_t{mema::value_t::NULL_VALUE}; + }; + + // Materialize based on type + if (dest_col.type == DataType::INT32) { + auto init = [](std::function alloc) { + return 
Int32PageBuilder(std::move(alloc)); + }; + materialize_deferred_column( + dest_col, buffers, total_matches, + [&](uint32_t rid, ColumnarReader::Cursor &cursor) { + return reader(rid, cursor); + }, + init, from_build, 4); + return; + } + + // VARCHAR + const Column *str_src_ptr = columnar_source; + if (!str_src_ptr) { + if (materialized_source) { + str_src_ptr = &deferred_plan.original_plan + ->inputs[materialized_source->source_table] + .columns[materialized_source->source_column]; + } else if (base_table_column) { + str_src_ptr = base_table_column; + } + } + + if (!str_src_ptr) { + // Shouldn't happen, but handle gracefully + return; + } + + auto init = [str_src_ptr](std::function alloc) { + return VarcharPageBuilder(*str_src_ptr, std::move(alloc)); + }; + + materialize_deferred_column( + dest_col, buffers, total_matches, + [&](uint32_t rid, ColumnarReader::Cursor &cursor) { + return reader(rid, cursor); + }, + init, from_build, 35); +} + +/** + * @brief Materialize all output columns from deferred intermediate. + * + * For root join in deferred execution path. Resolves all deferred columns + * by following row ID provenance to base tables. + * + * @tparam Mode Collection mode for compile-time specialization. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side deferred input. + * @param probe_input Probe side deferred input. + * @param join_node Deferred join node with column info. + * @param remapped_attrs Output projection after build/probe remapping. + * @param build_size Number of columns from build side. + * @param columnar_reader Reader for columnar data. + * @param deferred_plan Full deferred plan for base table access. + * @return ColumnarTable with final output. 
+ */ +template +inline ColumnarTable materialize_deferred_from_buffers( + std::vector> &buffers, + const DeferredInput &build_input, const DeferredInput &probe_input, + const DeferredJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left, ColumnarReader &columnar_reader, + const DeferredPlan &deferred_plan) { + + // Compute total matches + size_t total_matches = 0; + for (const auto &buf : buffers) { + total_matches += buf.count(); + } + + if (total_matches == 0) { + return create_empty_deferred_final(remapped_attrs); + } + + ColumnarTable result; + result.num_rows = total_matches; + + for (size_t out_idx = 0; out_idx < remapped_attrs.size(); ++out_idx) { + auto [col_idx, data_type] = remapped_attrs[out_idx]; + result.columns.emplace_back(data_type); + Column &dest_col = result.columns.back(); + + // Pass out_idx (output position) not col_idx (global column index) + // because materialize_single_deferred_column searches by original_idx + // which is the output position in join_node.columns + materialize_single_deferred_column( + dest_col, out_idx, build_size, build_is_left, buffers, + total_matches, build_input, probe_input, join_node, columnar_reader, + deferred_plan); + } + + return result; +} + +} // namespace materialize +} // namespace Contest diff --git a/src/analyze_plan.cpp b/src/analyze_plan.cpp new file mode 100644 index 0000000..f0ef0a8 --- /dev/null +++ b/src/analyze_plan.cpp @@ -0,0 +1,311 @@ +/** + * @file analyze_plan.cpp + * @brief Analyzes query plan and computes materialization decisions. + * + * Walks the plan tree in post-order to determine which columns should be + * materialized eagerly (join keys needed by parent) vs deferred until final + * output. Traces column provenance back to base tables for deferred resolution. + * + * @see deferred_plan.h for DeferredPlan structure. 
+ */ +#include +#include +#include + +#include + +namespace Contest { + +namespace { + +/** + * @brief Parent relationship info for a node. + */ +struct ParentInfo { + size_t parent_idx; ///< Parent node index in Plan::nodes. + bool is_left_child; ///< True if this node is parent's left child. +}; + +/** + * @brief Build map of node_idx → parent info. + * + * Root node will not have an entry in the map. + */ +std::unordered_map build_parent_map(const Plan &plan) { + std::unordered_map parent_map; + + for (size_t i = 0; i < plan.nodes.size(); ++i) { + const auto &node = plan.nodes[i]; + if (const auto *join = std::get_if(&node.data)) { + parent_map[join->left] = {i, true}; + parent_map[join->right] = {i, false}; + } + } + return parent_map; +} + +/** + * @brief Trace column provenance to base table. + * + * Recursively follows column through join nodes until reaching a scan node. + * + * @param plan Original query plan. + * @param node_idx Current node index. + * @param column_idx Column index in node's output_attrs. + * @return ColumnProvenance with base table ID and column index. 
+ */ +ColumnProvenance trace_provenance(const Plan &plan, size_t node_idx, + size_t column_idx) { + const auto &node = plan.nodes[node_idx]; + + if (const auto *scan = std::get_if(&node.data)) { + // Base case: column comes directly from scan + auto [actual_col_idx, _] = node.output_attrs[column_idx]; + return ColumnProvenance{static_cast(scan->base_table_id), + static_cast(actual_col_idx)}; + } + + // Join node: determine which child the column comes from + const auto &join = std::get(node.data); + const auto &left_node = plan.nodes[join.left]; + size_t left_size = left_node.output_attrs.size(); + + auto [col_idx, _] = node.output_attrs[column_idx]; + + if (col_idx < left_size) { + // Column from left child + return trace_provenance(plan, join.left, col_idx); + } else { + // Column from right child + return trace_provenance(plan, join.right, col_idx - left_size); + } +} + +/** + * @brief Find which column index in this node the parent needs as join key. + * + * @param plan Original query plan. + * @param node_idx Current node index. + * @param parent_map Map of node → parent relationship. + * @return Column index parent uses as join key, or nullopt if root. + */ +std::optional +find_parent_join_key(const Plan &plan, size_t node_idx, + const std::unordered_map &parent_map) { + auto it = parent_map.find(node_idx); + if (it == parent_map.end()) { + return std::nullopt; // Root node + } + + const auto &parent_node = plan.nodes[it->second.parent_idx]; + const auto &parent_join = std::get(parent_node.data); + + // Parent's join key for this child + return it->second.is_left_child ? parent_join.left_attr + : parent_join.right_attr; +} + +/** + * @brief Compute base collection mode based on which sides have output columns. + * + * Assumes build=left. If build=right at runtime, caller flips + * LEFT_ONLY/RIGHT_ONLY. 
+ */ +join::MatchCollectionMode +compute_base_collection_mode(const std::vector &columns, + size_t left_output_size) { + bool needs_left = false; + bool needs_right = false; + + for (const auto &col : columns) { + if (col.from_left) { + needs_left = true; + } else { + needs_right = true; + } + if (needs_left && needs_right) { + return join::MatchCollectionMode::BOTH; + } + } + + if (needs_left && !needs_right) + return join::MatchCollectionMode::LEFT_ONLY; + if (needs_right && !needs_left) + return join::MatchCollectionMode::RIGHT_ONLY; + return join::MatchCollectionMode::BOTH; +} + +/** + * @brief Collect tracked table IDs from a DeferredNode. + */ +std::vector get_tracked_tables(const DeferredNode &node) { + if (const auto *scan = std::get_if(&node)) { + return {scan->base_table_id}; + } + return std::get(node).tracked_table_ids; +} + +/** + * @brief Merge tracked table IDs from two children (sorted, unique). + */ +std::vector merge_table_ids(const DeferredNode &left, + const DeferredNode &right) { + auto left_ids = get_tracked_tables(left); + auto right_ids = get_tracked_tables(right); + + std::vector result; + result.reserve(left_ids.size() + right_ids.size()); + + std::merge(left_ids.begin(), left_ids.end(), right_ids.begin(), + right_ids.end(), std::back_inserter(result)); + + result.erase(std::unique(result.begin(), result.end()), result.end()); + return result; +} + +} // anonymous namespace + +DeferredPlan analyze_plan(const Plan &plan) { + DeferredPlan deferred; + deferred.original_plan = &plan; + deferred.nodes.resize(plan.nodes.size()); + deferred.root = plan.root; + + auto parent_map = build_parent_map(plan); + + // Build post-order traversal (children before parents) + std::vector post_order; + post_order.reserve(plan.nodes.size()); + std::vector visited(plan.nodes.size(), false); + + std::function visit = [&](size_t idx) { + if (visited[idx]) + return; + visited[idx] = true; + + const auto &node = plan.nodes[idx]; + if (const auto *join = 
std::get_if(&node.data)) { + visit(join->left); + visit(join->right); + } + post_order.push_back(idx); + }; + visit(plan.root); + + // PASS 1: Build structure and initial materialization decisions + for (size_t node_idx : post_order) { + const auto &node = plan.nodes[node_idx]; + + if (const auto *scan = std::get_if(&node.data)) { + // Scan node: simple wrapper + DeferredScanNode dscan; + dscan.node_idx = node_idx; + dscan.base_table_id = scan->base_table_id; + dscan.output_attrs = node.output_attrs; + deferred.nodes[node_idx] = std::move(dscan); + + } else { + // Join node: compute materialization decisions + const auto &join = std::get(node.data); + DeferredJoinNode djoin; + djoin.node_idx = node_idx; + djoin.left_child_idx = join.left; + djoin.right_child_idx = join.right; + djoin.left_join_attr = join.left_attr; + djoin.right_join_attr = join.right_attr; + djoin.output_attrs = node.output_attrs; + djoin.is_root = (node_idx == plan.root); + + // Find which column parent needs as join key + djoin.parent_join_key_idx = + find_parent_join_key(plan, node_idx, parent_map); + + // Get child sizes for determining column source + const auto &left_node = plan.nodes[join.left]; + size_t left_size = left_node.output_attrs.size(); + + // Build column info for each output column + for (size_t i = 0; i < node.output_attrs.size(); ++i) { + auto [col_idx, col_type] = node.output_attrs[i]; + + DeferredColumnInfo info; + info.original_idx = i; + info.type = col_type; + + // Determine if column is from left or right child + // col_idx is the combined L+R index: + // - [0, left_size) = position in left child's output + // - [left_size, ...) 
= position in right child's output + + // left_size + if (col_idx < left_size) { + info.from_left = true; + info.child_output_idx = col_idx; + } else { + info.from_left = false; + info.child_output_idx = col_idx - left_size; + } + + // Materialization decision: + // - At root: ALL columns must be materialized (final output) + // - At intermediate: only parent's join key is materialized + if (djoin.is_root) { + // Root node: materialize everything + info.resolution = ColumnResolution::MATERIALIZE; + } else if (djoin.parent_join_key_idx.has_value() && + i == *djoin.parent_join_key_idx) { + info.resolution = ColumnResolution::MATERIALIZE; + } else { + info.resolution = ColumnResolution::DEFER; + } + + // Trace provenance to base table + info.provenance = trace_provenance(plan, node_idx, i); + + djoin.columns.push_back(std::move(info)); + } + + // Compute collection mode and tracked tables + djoin.base_collection_mode = + compute_base_collection_mode(djoin.columns, left_size); + djoin.tracked_table_ids = merge_table_ids( + deferred.nodes[join.left], deferred.nodes[join.right]); + + deferred.nodes[node_idx] = std::move(djoin); + } + } + + // PASS 2: Propagate materialization requirements to children + // Process in reverse post-order (parents before children) + for (auto it = post_order.rbegin(); it != post_order.rend(); ++it) { + size_t node_idx = *it; + auto *djoin = std::get_if(&deferred.nodes[node_idx]); + if (!djoin) + continue; + + // For each column that must be MATERIALIZE, ensure the child also + // materializes it + for (const auto &col : djoin->columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + // Find which child this column comes from + size_t child_idx = + col.from_left ? 
djoin->left_child_idx : djoin->right_child_idx; + + auto *child_djoin = + std::get_if(&deferred.nodes[child_idx]); + if (!child_djoin) + continue; // Child is a scan - always has data + + // Mark child's column as MATERIALIZE + if (col.child_output_idx < child_djoin->columns.size()) { + child_djoin->columns[col.child_output_idx].resolution = + ColumnResolution::MATERIALIZE; + } + } + } + + return deferred; +} + +} // namespace Contest diff --git a/src/execute.cpp b/src/execute.cpp index a9589ad..d8ef462 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -40,6 +40,13 @@ #include #include +#ifdef USE_DEFERRED_MATERIALIZATION +#include +#include +#include +#include +#endif + namespace Contest { using namespace join; @@ -291,6 +298,349 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, return ExtendedResult{}; } +#ifdef USE_DEFERRED_MATERIALIZATION +// ============================================================================ +// DEFERRED MATERIALIZATION PATH +// ============================================================================ + +using DeferredJoinResult = std::variant; + +using materialize::construct_deferred_from_buffers; +using materialize::create_empty_deferred_result; +using materialize::materialize_deferred_from_buffers; + +// Forward declaration +DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, + size_t node_idx, bool is_root, + TimingStats &stats); + +/** + * @brief Resolve deferred plan node to DeferredInput. 
+ */ +DeferredInput resolve_deferred_input(const DeferredPlan &deferred_plan, + size_t node_idx, TimingStats &stats) { + DeferredInput input; + const auto &dnode = deferred_plan[node_idx]; + const auto &pnode = deferred_plan.original_plan->nodes[node_idx]; + input.node = &pnode; + input.deferred_node = &dnode; + + if (const auto *dscan = std::get_if(&dnode)) { + input.data = &deferred_plan.original_plan->inputs[dscan->base_table_id]; + input.table_id = dscan->base_table_id; + } else { + auto result = + execute_deferred_impl(deferred_plan, node_idx, false, stats); + input.data = std::get(std::move(result)); + input.table_id = 0; + } + return input; +} + +/** + * @brief Select build/probe sides for deferred input. + */ +BuildProbeConfig select_deferred_build_probe_side( + const JoinNode &join, const DeferredInput &left_input, + const DeferredInput &right_input, + const std::vector> &output_attrs) { + BuildProbeConfig config; + + size_t left_rows = left_input.row_count(join.left_attr); + size_t right_rows = right_input.row_count(join.right_attr); + config.build_left = left_rows <= right_rows; + + config.build_attr = config.build_left ? join.left_attr : join.right_attr; + config.probe_attr = config.build_left ? join.right_attr : join.left_attr; + + config.remapped_attrs = output_attrs; + size_t left_size = left_input.output_size(); + size_t build_size = + config.build_left ? left_size : right_input.output_size(); + + if (!config.build_left) { + for (auto &[col_idx, dtype] : config.remapped_attrs) { + if (col_idx < left_size) { + col_idx = build_size + col_idx; + } else { + col_idx = col_idx - left_size; + } + } + } + return config; +} + +/** + * @brief Unified probe + materialize for deferred path. 
+ */ +template +DeferredJoinResult execute_deferred_join_with_mode( + bool use_nested_loop, bool probe_is_columnar, bool is_root, + const UnchainedHashtable *hash_table, const DeferredInput &build_input, + const DeferredInput &probe_input, const BuildProbeConfig &config, + const DeferredJoinNode &join_node, io::ColumnarReader &columnar_reader, + const DeferredPlan &deferred_plan, + const std::vector &merged_table_ids, TimingStats &stats) { + + std::vector> match_buffers; + + // Probe phase - need to convert DeferredInput to JoinInput for probing + // For now, handle columnar probe directly + if (use_nested_loop) { + auto nested_loop_start = std::chrono::high_resolution_clock::now(); + // Nested loop requires JoinInput - create adapter + JoinInput build_ji, probe_ji; + build_ji.node = build_input.node; + probe_ji.node = probe_input.node; + + if (build_input.is_columnar()) { + build_ji.data = std::get(build_input.data); + build_ji.table_id = build_input.table_id; + } else { + // Convert DeferredResult to ExtendedResult for compatibility + // This is a limitation - nested loop path falls back to eager + const auto &dr = std::get(build_input.data); + ExtendedResult er; + er.columns = std::move( + const_cast &>(dr.materialized)); + er.row_ids = std::move( + const_cast &>(dr.row_ids)); + er.table_ids = dr.table_ids; + build_ji.data = std::move(er); + build_ji.table_id = 0; + } + + if (probe_input.is_columnar()) { + probe_ji.data = std::get(probe_input.data); + probe_ji.table_id = probe_input.table_id; + } else { + const auto &dr = std::get(probe_input.data); + ExtendedResult er; + er.columns = std::move( + const_cast &>(dr.materialized)); + er.row_ids = std::move( + const_cast &>(dr.row_ids)); + er.table_ids = dr.table_ids; + probe_ji.data = std::move(er); + probe_ji.table_id = 0; + } + + match_buffers = nested_loop_join( + build_ji, probe_ji, config.build_attr, config.probe_attr); + auto nested_loop_end = std::chrono::high_resolution_clock::now(); + 
stats.nested_loop_join_ms += + std::chrono::duration_cast( + nested_loop_end - nested_loop_start) + .count(); + } else { + auto probe_start = std::chrono::high_resolution_clock::now(); + if (probe_is_columnar) { + // Create JoinInput for columnar probe + JoinInput probe_ji; + probe_ji.node = probe_input.node; + probe_ji.data = std::get(probe_input.data); + probe_ji.table_id = probe_input.table_id; + match_buffers = + probe_columnar(*hash_table, probe_ji, config.probe_attr); + } else { + const auto &probe_result = + std::get(probe_input.data); + // Probe using materialized column (should be the join key) + const auto *mat_col = + probe_result.get_materialized(config.probe_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: probe join key not materialized! probe_attr=%zu " + "mat_map_size=%zu num_rows=%zu\n", + config.probe_attr, probe_result.materialized_map.size(), + probe_result.num_rows); + std::abort(); + } + match_buffers = probe_intermediate(*hash_table, *mat_col); + } + auto probe_end = std::chrono::high_resolution_clock::now(); + stats.hash_join_probe_ms += + std::chrono::duration_cast(probe_end - + probe_start) + .count(); + } + + size_t total_matches = 0; + for (const auto &buf : match_buffers) { + total_matches += buf.count(); + } + + if (is_root) { + auto mat_start = std::chrono::high_resolution_clock::now(); + DeferredJoinResult final_result; + if (total_matches == 0) { + final_result = + materialize::create_empty_deferred_final(config.remapped_attrs); + } else { + // Prepare page indices for final materialization + materialize::prepare_final_deferred_columns( + columnar_reader, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left); + + final_result = materialize_deferred_from_buffers( + match_buffers, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left, columnar_reader, deferred_plan); + } + auto mat_end = 
std::chrono::high_resolution_clock::now(); + stats.materialize_ms += + std::chrono::duration_cast(mat_end - + mat_start) + .count(); + return final_result; + } else { + auto inter_start = std::chrono::high_resolution_clock::now(); + DeferredResult result; + if (total_matches > 0) { + // Prepare page indices for intermediate construction + materialize::prepare_deferred_columns( + columnar_reader, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left); + + construct_deferred_from_buffers( + match_buffers, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left, columnar_reader, result, merged_table_ids, + deferred_plan); + } else { + result = create_empty_deferred_result(join_node); + } + auto inter_end = std::chrono::high_resolution_clock::now(); + stats.intermediate_ms += + std::chrono::duration_cast(inter_end - + inter_start) + .count(); + return std::move(result); + } +} + +/** + * @brief Recursive deferred join execution. + */ +DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, + size_t node_idx, bool is_root, + TimingStats &stats) { + const auto &dnode = deferred_plan[node_idx]; + + if (std::holds_alternative(dnode)) { + return DeferredResult{}; + } + + const auto &djoin = std::get(dnode); + const auto &plan = *deferred_plan.original_plan; + const auto &pnode = plan.nodes[node_idx]; + const auto &join = std::get(pnode.data); + + // Resolve inputs + DeferredInput left_input = + resolve_deferred_input(deferred_plan, djoin.left_child_idx, stats); + DeferredInput right_input = + resolve_deferred_input(deferred_plan, djoin.right_child_idx, stats); + + // Build/probe selection + auto setup_start = std::chrono::high_resolution_clock::now(); + auto config = select_deferred_build_probe_side( + join, left_input, right_input, djoin.output_attrs); + const DeferredInput &build_input = + config.build_left ? 
left_input : right_input; + const DeferredInput &probe_input = + config.build_left ? right_input : left_input; + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + const size_t HASH_TABLE_THRESHOLD = 8; + size_t build_rows = build_input.row_count(config.build_attr); + // Nested loop doesn't work with DeferredResult because it only has join + // keys materialized. Force hash join when either side is DeferredResult. + bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD) && + build_is_columnar && probe_is_columnar; + + // Merge table IDs + auto build_tables = build_input.tracked_tables(); + auto probe_tables = probe_input.tracked_tables(); + auto merged_table_ids = merge_tracked_tables(build_tables, probe_tables); + + io::ColumnarReader columnar_reader; + auto setup_end = std::chrono::high_resolution_clock::now(); + stats.setup_ms += std::chrono::duration_cast( + setup_end - setup_start) + .count(); + + // For deferred materialization, we always need BOTH row indices because + // we track provenance from both sides for deferred column resolution. + // The optimization to collect only one side's indices is not safe here. + MatchCollectionMode mode = MatchCollectionMode::BOTH; + + // Build hash table if needed + std::optional hash_table; + if (!use_nested_loop) { + auto build_start = std::chrono::high_resolution_clock::now(); + if (build_is_columnar) { + JoinInput build_ji; + build_ji.node = build_input.node; + build_ji.data = std::get(build_input.data); + build_ji.table_id = build_input.table_id; + hash_table = build_from_columnar(build_ji, config.build_attr); + } else { + const auto &dr = std::get(build_input.data); + const auto *mat_col = dr.get_materialized(config.build_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: build join key not materialized! 
build_attr=%zu " + "mat_map_size=%zu num_rows=%zu\n", + config.build_attr, dr.materialized_map.size(), dr.num_rows); + // Fatal - this should never happen + std::abort(); + } + hash_table.emplace(mat_col->row_count()); + hash_table->build_intermediate(*mat_col); + } + auto build_end = std::chrono::high_resolution_clock::now(); + stats.hashtable_build_ms += + std::chrono::duration_cast(build_end - + build_start) + .count(); + } + + // Dispatch based on collection mode + switch (mode) { + case MatchCollectionMode::BOTH: + return execute_deferred_join_with_mode( + use_nested_loop, probe_is_columnar, is_root, + use_nested_loop ? nullptr : &(*hash_table), build_input, + probe_input, config, djoin, columnar_reader, deferred_plan, + merged_table_ids, stats); + + case MatchCollectionMode::LEFT_ONLY: + return execute_deferred_join_with_mode( + use_nested_loop, probe_is_columnar, is_root, + use_nested_loop ? nullptr : &(*hash_table), build_input, + probe_input, config, djoin, columnar_reader, deferred_plan, + merged_table_ids, stats); + + case MatchCollectionMode::RIGHT_ONLY: + return execute_deferred_join_with_mode( + use_nested_loop, probe_is_columnar, is_root, + use_nested_loop ? nullptr : &(*hash_table), build_input, + probe_input, config, djoin, columnar_reader, deferred_plan, + merged_table_ids, stats); + } + + return DeferredResult{}; +} + +#endif // USE_DEFERRED_MATERIALIZATION + /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. 
@@ -307,7 +657,27 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, auto total_start = std::chrono::high_resolution_clock::now(); TimingStats stats; + +#ifdef USE_DEFERRED_MATERIALIZATION + // Deferred materialization path: analyze plan, then execute with deferred + // intermediate construction + auto analyze_start = std::chrono::high_resolution_clock::now(); + DeferredPlan deferred_plan = analyze_plan(plan); + auto analyze_end = std::chrono::high_resolution_clock::now(); + stats.analyze_plan_ms = + std::chrono::duration_cast(analyze_end - + analyze_start) + .count(); + + auto deferred_result = + execute_deferred_impl(deferred_plan, plan.root, true, stats); + ColumnarTable final_result = + std::get(std::move(deferred_result)); +#else + // Eager materialization path (original) auto result = execute_impl(plan, plan.root, true, stats); + ColumnarTable final_result = std::get(std::move(result)); +#endif auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast( @@ -315,11 +685,19 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, stats.total_execution_ms = total_elapsed.count(); if (show_detailed_timing) { - int64_t accounted = - stats.hashtable_build_ms + stats.hash_join_probe_ms + - stats.nested_loop_join_ms + stats.materialize_ms + stats.setup_ms; + int64_t accounted = stats.hashtable_build_ms + + stats.hash_join_probe_ms + + stats.nested_loop_join_ms + stats.materialize_ms + + stats.setup_ms + stats.intermediate_ms; +#ifdef USE_DEFERRED_MATERIALIZATION + accounted += stats.analyze_plan_ms; +#endif int64_t other = stats.total_execution_ms - accounted; +#ifdef USE_DEFERRED_MATERIALIZATION + std::cout << "[DEFERRED] Plan Analysis Time: " << stats.analyze_plan_ms + << " ms\n"; +#endif std::cout << "Hashtable Build Time: " << stats.hashtable_build_ms << " ms\n"; std::cout << "Hash Join Probe Time: " << stats.hash_join_probe_ms @@ -339,7 +717,7 @@ 
ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out,
         *stats_out = stats;
     }
 
-    return std::move(std::get(result));
+    return std::move(final_result);
 }
 
 void *build_context() { return nullptr; }

From 665968c6a27f73350f75d1496dd94bc92b898d33 Mon Sep 17 00:00:00 2001
From: Themos Papatheofanous
Date: Thu, 22 Jan 2026 01:47:27 +0200
Subject: [PATCH 03/13] feat: draft deferred materialization

---
 include/data_model/deferred_intermediate.h   |  77 +--
 include/data_model/deferred_plan.h           |   6 +-
 include/data_model/intermediate.h            |  82 +++
 include/foundation/common.h                  |  49 ++
 include/join_execution/match_collector.h     |  64 ++
 include/join_execution/nested_loop.h         | 220 +++++++
 include/materialization/construct_deferred.h | 597 ++++++++++++------
 .../materialization/materialize_deferred.h   |  56 +-
 include/platform/arena.h                     |  26 +-
 src/analyze_plan.cpp                         |  54 +-
 src/execute.cpp                              |  83 +--
 11 files changed, 942 insertions(+), 372 deletions(-)

diff --git a/include/data_model/deferred_intermediate.h b/include/data_model/deferred_intermediate.h
index 0c16a13..8e183d5 100644
--- a/include/data_model/deferred_intermediate.h
+++ b/include/data_model/deferred_intermediate.h
@@ -2,9 +2,10 @@
  * @file deferred_intermediate.h
  * @brief Lightweight intermediate result for deferred materialization.
  *
- * DeferredResult stores only materialized columns (join keys) plus row ID
- * provenance columns. Deferred columns are resolved at final materialization
- * by following row IDs back to base tables.
+ * DeferredResult stores only materialized columns (join keys) plus
+ * per-deferred-column provenance using 64-bit encoding (table_id, column_idx,
+ * row_id). Deferred columns are resolved at final materialization by decoding
+ * the provenance and reading directly from base tables.
  *
  * @see deferred_plan.h for DeferredJoinNode with column decisions.
  * @see construct_deferred.h for building DeferredResult.
@@ -27,14 +28,15 @@ namespace Contest { * * Unlike ExtendedResult which stores all projected columns, DeferredResult * stores only columns marked MATERIALIZE (typically just the parent's join - * key). All other columns are resolved at final materialization using row ID - * provenance. + * key). All other columns are resolved at final materialization using + * per-column 64-bit provenance (table_id, column_idx, row_id). * * Memory savings: For a join projecting N columns where only 1 is a join key, * DeferredResult uses ~1/N the memory of ExtendedResult for data columns. + * Additionally, we only track provenance for deferred columns (not all tables). * * @see DeferredColumnInfo for materialization decisions. - * @see DeferredJoinNode for column provenance tracking. + * @see DeferredProvenance for 64-bit encoding scheme. */ struct DeferredResult { /// Only columns marked MATERIALIZE (typically 1 join key). @@ -44,11 +46,13 @@ struct DeferredResult { /// deferred). std::vector> materialized_map; - /// Row ID tracking for provenance (same as ExtendedResult). - std::vector row_ids; + /// Per-deferred-column provenance (64-bit encoded table_id+column_idx+row). + /// One deferred_column_t per DEFER column, stores full provenance per row. + std::vector deferred_columns; - /// Which base tables are tracked (sorted). - std::vector table_ids; + /// Map: original column index → index in deferred_columns (nullopt if + /// materialized). + std::vector> deferred_map; /// Reference to node info for column provenance resolution. const DeferredJoinNode *node_info = nullptr; @@ -71,6 +75,12 @@ struct DeferredResult { materialized_map[orig_idx].has_value(); } + /** @brief Check if column is deferred. */ + bool is_deferred(size_t orig_idx) const { + return orig_idx < deferred_map.size() && + deferred_map[orig_idx].has_value(); + } + /** @brief Get materialized column, or nullptr if deferred. 
*/ const mema::column_t *get_materialized(size_t orig_idx) const { if (!is_materialized(orig_idx)) @@ -78,26 +88,22 @@ struct DeferredResult { return &materialized[*materialized_map[orig_idx]]; } - /** @brief Find row ID column index for a table, or -1 if not found. */ - int find_rowid_index(uint8_t tid) const { - for (size_t i = 0; i < table_ids.size(); ++i) { - if (table_ids[i] == tid) - return static_cast(i); - } - return -1; + /** @brief Get deferred column provenance, or nullptr if materialized. */ + const mema::deferred_column_t *get_deferred(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_columns[*deferred_map[orig_idx]]; } - /** @brief Get row ID column for a table, or nullptr if not found. */ - const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { - int idx = find_rowid_index(tid); - return (idx >= 0) ? &row_ids[idx] : nullptr; + /** @brief Get mutable deferred column provenance, or nullptr. */ + mema::deferred_column_t *get_deferred_mut(size_t orig_idx) { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_columns[*deferred_map[orig_idx]]; } - /** @brief Get mutable row ID column for a table, or nullptr. */ - mema::rowid_column_t *get_rowid_column_mut(uint8_t tid) { - int idx = find_rowid_index(tid); - return (idx >= 0) ? &row_ids[idx] : nullptr; - } + /** @brief Number of deferred columns. */ + size_t num_deferred() const { return deferred_columns.size(); } }; /** @@ -150,19 +156,16 @@ struct DeferredInput { return 0; } - /** @brief Get list of tracked table IDs. */ - std::vector tracked_tables() const { - if (is_columnar()) { - return {table_id}; - } - return std::get(data).table_ids; - } - - /** @brief Get row ID column for a table. */ - const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + /** + * @brief Get deferred column provenance for a column index. + * + * For columnar inputs, returns nullptr (caller must encode fresh). 
+ * For DeferredResult inputs, returns existing provenance column. + */ + const mema::deferred_column_t *get_deferred_column(size_t col_idx) const { if (is_columnar()) return nullptr; - return std::get(data).get_rowid_column(tid); + return std::get(data).get_deferred(col_idx); } }; diff --git a/include/data_model/deferred_plan.h b/include/data_model/deferred_plan.h index 46daa23..abb934f 100644 --- a/include/data_model/deferred_plan.h +++ b/include/data_model/deferred_plan.h @@ -76,7 +76,7 @@ struct DeferredScanNode { * - Which columns to materialize eagerly (join keys for parent) * - Column provenance for deferred resolution * - Pre-computed match collection mode - * - Table IDs tracked through this node + * - Number of deferred columns for allocation */ struct DeferredJoinNode { size_t node_idx; ///< Index in original Plan::nodes. @@ -95,8 +95,8 @@ struct DeferredJoinNode { /// Pre-computed collection mode (assumes build=left; flip if build=right). join::MatchCollectionMode base_collection_mode; - /// Sorted table IDs tracked through this node (union of children). - std::vector tracked_table_ids; + /// Number of deferred columns (for pre-allocation). + size_t num_deferred_columns = 0; /// Column index that parent needs as join key (nullopt if root). std::optional parent_join_key_idx; diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 5f693a7..f4fa9c8 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -230,6 +230,88 @@ struct rowid_column_t { } }; +/** + * @brief 64-bit provenance column for deferred materialization. + * + * Stores encoded (table_id, column_idx, row_id) for each row using + * DeferredProvenance encoding. Uses 32KB pages with 4096 entries each. + * + * @see DeferredProvenance for encoding scheme. + * @see deferred_intermediate.h for DeferredResult usage. 
+ */ +struct deferred_column_t { + static constexpr size_t PAGE_SIZE = 1 << 15; // 32KB + static constexpr size_t ENTRIES_PER_PAGE = + PAGE_SIZE / sizeof(uint64_t); // 4096 + static constexpr size_t ENTRY_SHIFT = 12; // log2(4096) + static constexpr size_t ENTRY_MASK = ENTRIES_PER_PAGE - 1; + + struct alignas(PAGE_SIZE) Page { + uint64_t data[ENTRIES_PER_PAGE]; + }; + + std::vector pages; + size_t num_values = 0; + + deferred_column_t() = default; + + deferred_column_t(deferred_column_t &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values) { + other.pages.clear(); + other.num_values = 0; + } + + deferred_column_t &operator=(deferred_column_t &&other) noexcept { + if (this != &other) { + pages = std::move(other.pages); + num_values = other.num_values; + other.pages.clear(); + other.num_values = 0; + } + return *this; + } + + deferred_column_t(const deferred_column_t &) = delete; + deferred_column_t &operator=(const deferred_column_t &) = delete; + + ~deferred_column_t() = default; + + /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. */ + inline uint64_t operator[](size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK]; + } + + /** @brief Thread-safe write at idx (requires pages to be set up first). */ + inline void write_at(size_t idx, uint64_t val) { + pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = val; + } + + /** @brief Total value count. */ + size_t row_count() const { return num_values; } + + /** @brief Set row count without allocation (for assembly pattern). */ + inline void set_row_count(size_t count) { num_values = count; } + + /** @brief Pre-allocate pages from arena. 
*/ + inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, + size_t count) { + static_assert( + sizeof(Page) == + Contest::platform::ChunkSize< + Contest::platform::ChunkType::DEFERRED_PAGE>::value, + "Page size mismatch with DEFERRED_PAGE chunk size"); + size_t pages_needed = (count + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE; + pages.reserve(pages_needed); + for (size_t i = 0; i < pages_needed; ++i) { + void *ptr = + arena + .alloc_chunk(); + pages.push_back(reinterpret_cast(ptr)); + } + num_values = count; + } +}; + /** * @brief Convert column_t vector to ColumnarTable. Dereferences VARCHAR refs. * @see materialize.h diff --git a/include/foundation/common.h b/include/foundation/common.h index 192fe08..49967cd 100644 --- a/include/foundation/common.h +++ b/include/foundation/common.h @@ -203,4 +203,53 @@ struct GlobalRowId { static inline uint32_t row(uint32_t encoded) { return encoded & ROW_MASK; } }; +/** + * @brief 64-bit encoding for deferred column provenance. + * + * Encodes table_id, column_idx, and row_id into a single 64-bit value + * for efficient storage and resolution of deferred columns. + * + * Encoding: [table_id (8 bits)][column_idx (8 bits)][row_id (48 bits)] + * - table_id: bits 56-63 + * - column_idx: bits 48-55 + * - row_id: bits 0-47 + * + * Supports up to 256 tables, 256 columns per table, and 281 trillion rows. 
+ */ +struct DeferredProvenance { + static constexpr uint64_t ROW_BITS = 48; + static constexpr uint64_t COLUMN_BITS = 8; + static constexpr uint64_t TABLE_BITS = 8; + + static constexpr uint64_t ROW_MASK = (1ULL << ROW_BITS) - 1; + static constexpr uint64_t COLUMN_MASK = (1ULL << COLUMN_BITS) - 1; + static constexpr uint64_t COLUMN_SHIFT = ROW_BITS; + static constexpr uint64_t TABLE_SHIFT = ROW_BITS + COLUMN_BITS; + + static constexpr uint64_t MAX_TABLES = 1ULL << TABLE_BITS; // 256 + static constexpr uint64_t MAX_COLUMNS = 1ULL << COLUMN_BITS; // 256 + static constexpr uint64_t MAX_ROWS = 1ULL << ROW_BITS; // 281 trillion + + /** @brief Encode table_id, column_idx, row_id into single uint64_t. */ + static inline uint64_t encode(uint8_t table_id, uint8_t column_idx, + uint64_t row_id) { + return (static_cast(table_id) << TABLE_SHIFT) | + (static_cast(column_idx) << COLUMN_SHIFT) | + (row_id & ROW_MASK); + } + + /** @brief Extract table_id from encoded provenance. */ + static inline uint8_t table(uint64_t encoded) { + return static_cast(encoded >> TABLE_SHIFT); + } + + /** @brief Extract column_idx from encoded provenance. */ + static inline uint8_t column(uint64_t encoded) { + return static_cast((encoded >> COLUMN_SHIFT) & COLUMN_MASK); + } + + /** @brief Extract row_id from encoded provenance. */ + static inline uint64_t row(uint64_t encoded) { return encoded & ROW_MASK; } +}; + } // namespace Contest \ No newline at end of file diff --git a/include/join_execution/match_collector.h b/include/join_execution/match_collector.h index 78657b7..a4136cb 100644 --- a/include/join_execution/match_collector.h +++ b/include/join_execution/match_collector.h @@ -160,6 +160,60 @@ class ThreadLocalMatchBuffer { ChainIterator end() const { return ChainIterator(nullptr, 0); } }; + /** + * @brief Batch reader for efficient SIMD access to chunk chains. 
+ * + * Unlike ChainIterator which reads one element at a time, this reader + * provides direct pointer access to contiguous batches within chunks. + * Essential for SIMD provenance encoding in deferred materialization. + */ + class ChunkBatchReader { + IndexChunk *current_chunk; + uint32_t offset; + size_t remaining; + + public: + ChunkBatchReader(IndexChunk *chunk, size_t count) + : current_chunk(chunk), offset(0), remaining(count) {} + + /** @brief Returns true if more data is available. */ + inline bool has_more() const { return remaining > 0 && current_chunk; } + + /** + * @brief Get pointer to contiguous batch of row IDs. + * + * Returns pointer to up to max_batch contiguous elements within + * current chunk. Actual count may be less if chunk boundary reached. + * + * @param max_batch Maximum elements to return. + * @param actual_count Output: actual number of elements available. + * @return Pointer to contiguous row IDs, or nullptr if exhausted. + */ + inline const uint32_t *get_batch(size_t max_batch, + size_t &actual_count) { + if (!current_chunk || remaining == 0) { + actual_count = 0; + return nullptr; + } + + size_t available = current_chunk->count - offset; + actual_count = std::min({max_batch, remaining, available}); + const uint32_t *ptr = ¤t_chunk->ids[offset]; + + offset += static_cast(actual_count); + remaining -= actual_count; + + if (offset >= current_chunk->count && current_chunk->next) { + current_chunk = current_chunk->next; + offset = 0; + } + return ptr; + } + + /** @brief Remaining element count. */ + inline size_t count() const { return remaining; } + }; + /** @brief Returns range for iterating left (build) row IDs. */ inline ChainRange left_range() const { return ChainRange(left_head, total_count); @@ -170,6 +224,16 @@ class ThreadLocalMatchBuffer { return ChainRange(right_head, total_count); } + /** @brief Returns batch reader for left (build) row IDs. 
*/ + inline ChunkBatchReader left_batch_reader() const { + return ChunkBatchReader(left_head, total_count); + } + + /** @brief Returns batch reader for right (probe) row IDs. */ + inline ChunkBatchReader right_batch_reader() const { + return ChunkBatchReader(right_head, total_count); + } + /** @brief Returns match count in this buffer. */ size_t count() const { return total_count; } diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index 7646639..8546854 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -83,6 +84,59 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } +/** + * @brief Iterates over non-NULL values in a deferred input column. + * + * Abstracts columnar vs DeferredResult input. For DeferredResult, reads from + * materialized columns (join keys are always materialized). + * + * @tparam Func void(uint32_t row_id, int32_t value). 
+ */ +template +inline void visit_deferred_rows(const DeferredInput &input, size_t attr_idx, + Func &&visitor) { + if (input.is_columnar()) { + auto *table = std::get(input.data); + auto [col_idx, _] = input.node->output_attrs[attr_idx]; + const Column &col = table->columns[col_idx]; + + uint32_t row_id = 0; + for (auto *page_obj : col.pages) { + auto *page = page_obj->data; + auto num_rows = *reinterpret_cast(page); + auto num_values = *reinterpret_cast(page + 2); + auto *data = reinterpret_cast(page + 4); + + uint16_t val_idx = 0; + for (uint16_t i = 0; i < num_rows; i++) { + if (num_rows == num_values) { + visitor(row_id++, data[i]); + } else { + auto *bitmap = reinterpret_cast( + page + PAGE_SIZE - (num_rows + 7) / 8); + if (bitmap[i / 8] & (1u << (i % 8))) { + visitor(row_id, data[val_idx++]); + } + row_id++; + } + } + } + } else { + const auto &res = std::get(input.data); + // Join key must be materialized + const mema::column_t *col = res.get_materialized(attr_idx); + if (!col) + return; // Should not happen - join keys are always materialized + size_t count = col->row_count(); + for (size_t i = 0; i < count; i++) { + const mema::value_t &val = (*col)[i]; + if (!val.is_null()) { + visitor(static_cast(i), val.value); + } + } + } +} + /** * @brief Nested loop join for small build tables (<=8 rows). * @@ -239,4 +293,170 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, return buffers; } +/** + * @brief Nested loop join for deferred execution path. + * + * Same algorithm as nested_loop_join but works with DeferredInput. + * Supports both columnar and DeferredResult inputs. + * + * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY). + * @return Thread-local match buffers for direct iteration. 
+ */ +template +inline std::vector> +nested_loop_join_deferred(const DeferredInput &build_input, + const DeferredInput &probe_input, size_t build_attr, + size_t probe_attr) { + size_t build_rows = build_input.row_count(build_attr); + size_t probe_rows = probe_input.row_count(probe_attr); + + if (build_rows == 0 || probe_rows == 0) + return {}; + + size_t num_threads = THREAD_COUNT; + std::vector> buffers(num_threads); + + constexpr size_t MAX_BUILD_SIZE = 8; + alignas(32) int32_t b_vals[MAX_BUILD_SIZE]; + alignas(16) uint32_t b_ids[MAX_BUILD_SIZE]; + size_t b_count = 0; + + auto collect_build = [&](uint32_t id, int32_t val) { + if (b_count < MAX_BUILD_SIZE) { + b_ids[b_count] = id; + b_vals[b_count] = val; + b_count++; + } + }; + + visit_deferred_rows(build_input, build_attr, collect_build); + + for (size_t i = b_count; i < MAX_BUILD_SIZE; ++i) { + b_vals[i] = INT32_MIN; + } + + // Setup for columnar probe (page-based parallel processing) + const Column *probe_col = nullptr; + platform::ArenaVector page_offsets( + Contest::platform::get_arena(0)); + if (probe_input.is_columnar()) { + auto *table = std::get(probe_input.data); + auto [col_idx, _] = probe_input.node->output_attrs[probe_attr]; + probe_col = &table->columns[col_idx]; + + page_offsets.reserve(probe_col->pages.size() + 1); + uint32_t current = 0; + for (auto *p : probe_col->pages) { + page_offsets.push_back(current); + current += *reinterpret_cast(p->data); + } + page_offsets.push_back(current); + } + + // Setup for DeferredResult probe + const mema::column_t *probe_mat_col = nullptr; + if (!probe_input.is_columnar()) { + const auto &res = std::get(probe_input.data); + probe_mat_col = res.get_materialized(probe_attr); + if (!probe_mat_col) + return {}; // Join key not materialized - should not happen + } + + std::atomic probe_page_counter{0}; + + worker_pool().execute([&](size_t t_id) { + buffers[t_id] = + ThreadLocalMatchBuffer(Contest::platform::get_arena(t_id)); + auto &local_buffer = buffers[t_id]; + 
+ auto process_value = [&](uint32_t p_id, int32_t p_val) { + simd::eq_scan_build(p_id, p_val, b_vals, b_ids, b_count, + local_buffer); + }; + + if (probe_input.is_columnar()) { + size_t num_pages = probe_col->pages.size(); + + while (true) { + size_t i = + probe_page_counter.fetch_add(1, std::memory_order_relaxed); + + if (i >= num_pages) + break; + auto *page = probe_col->pages[i]->data; + auto num_rows = *reinterpret_cast(page); + auto num_values = *reinterpret_cast(page + 2); + auto *data = reinterpret_cast(page + 4); + uint32_t row_id = page_offsets[i]; + + if (num_rows == num_values) { + // SIMD batch: process multiple probe values at a time + uint16_t j = simd::eq_batch_columnar( + data, num_rows, row_id, b_vals, b_ids, b_count, + local_buffer); + row_id += j; + // Handle remaining elements with scalar + for (; j < num_rows; j++) { + process_value(row_id++, data[j]); + } + } else { + auto *bitmap = reinterpret_cast( + page + PAGE_SIZE - (num_rows + 7) / 8); + uint16_t val_idx = 0; + for (uint16_t j = 0; j < num_rows; j++) { + if (bitmap[j / 8] & (1u << (j % 8))) { + process_value(row_id, data[val_idx++]); + } + row_id++; + } + } + } + } else { + // DeferredResult probe - use materialized column + const mema::column_t &col = *probe_mat_col; + size_t count = col.row_count(); + size_t start = (t_id * count) / THREAD_COUNT; + size_t end = ((t_id + 1) * count) / THREAD_COUNT; + + constexpr size_t BATCH_SIZE = simd::INTERMEDIATE_BATCH_SIZE; + size_t i = start; + + if constexpr (BATCH_SIZE > 0) { + // SIMD batch processing + for (; i + BATCH_SIZE <= end; i += BATCH_SIZE) { + size_t page_idx = i >> 12; + size_t offset = i & 0xFFF; + + // Only use SIMD if all values are on same page + if (offset + BATCH_SIZE <= mema::CAP_PER_PAGE) { + const int32_t *vals = reinterpret_cast( + &col.pages[page_idx]->data[offset]); + simd::eq_batch_intermediate( + vals, i, b_vals, b_ids, b_count, local_buffer); + } else { + // Cross-page boundary: fall back to scalar + for (size_t j = i; 
j < i + BATCH_SIZE; j++) { + const mema::value_t &val = col[j]; + if (!val.is_null()) { + process_value(static_cast(j), + val.value); + } + } + } + } + } + + // Handle remaining elements (or all elements if no SIMD) + for (; i < end; i++) { + const mema::value_t &val = col[i]; + if (!val.is_null()) { + process_value(static_cast(i), val.value); + } + } + } + }); + + return buffers; +} + } // namespace Contest::join diff --git a/include/materialization/construct_deferred.h b/include/materialization/construct_deferred.h index 5ba8b3e..bb9a425 100644 --- a/include/materialization/construct_deferred.h +++ b/include/materialization/construct_deferred.h @@ -3,8 +3,14 @@ * @brief Constructs deferred intermediate results for multi-way joins. * * Allocates and populates DeferredResult with only MATERIALIZE columns - * (typically just the parent's join key). Row ID columns are always - * populated for provenance tracking. + * (typically just the parent's join key). Deferred columns store 64-bit + * provenance (table_id, column_idx, row_id) for resolution at final output. + * + * Optimized with: + * - Column-major iteration for cache locality + * - Precomputed source metadata to avoid per-row variant access + * - SIMD provenance encoding (AVX2/NEON) for deferred columns + * - Batch access to match collector chunks * * @see construct_intermediate.h for the eager materialization equivalent. * @see materialize_deferred.h for final resolution of deferred columns. 
@@ -23,6 +29,12 @@ #include #include +#if defined(__x86_64__) +#include +#elif defined(__aarch64__) +#include +#endif + namespace Contest { namespace materialize { @@ -32,6 +44,164 @@ using Contest::join::ThreadLocalMatchBuffer; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; +// ============================================================================ +// SIMD Provenance Encoding +// ============================================================================ + +namespace simd_provenance { + +#if defined(__x86_64__) && defined(__AVX2__) +inline constexpr size_t BATCH_SIZE = 4; ///< 4 x uint64_t in AVX2 (256-bit) +#elif defined(__aarch64__) +inline constexpr size_t BATCH_SIZE = 2; ///< 2 x uint64_t in NEON (128-bit) +#else +inline constexpr size_t BATCH_SIZE = 0; ///< No SIMD available +#endif + +/** + * @brief Encode provenance for batch of row IDs using SIMD. + * + * Encodes (table_id << 56) | (column_idx << 48) | row_id for each row. + * Uses AVX2 on x86_64 or NEON on aarch64, with scalar fallback. 
+ * + * @param dest Destination deferred column + * @param start_idx Starting output index + * @param row_ids Pointer to row IDs (from IndexChunk, contiguous) + * @param count Number of row IDs to process + * @param table_id Base table ID (constant for all rows) + * @param column_idx Base column index (constant for all rows) + * @return Number of rows processed (always == count) + */ +inline size_t encode_provenance_batch(mema::deferred_column_t &dest, + size_t start_idx, const uint32_t *row_ids, + size_t count, uint8_t table_id, + uint8_t column_idx) { + // Precompute constant prefix: (table_id << 56) | (column_idx << 48) + const uint64_t prefix = DeferredProvenance::encode(table_id, column_idx, 0); + + size_t i = 0; + +#if defined(__x86_64__) && defined(__AVX2__) + // AVX2: Process 4 x uint64_t at a time + // Load 4 x uint32_t, zero-extend to 4 x uint64_t, OR with prefix + const __m256i prefix_vec = _mm256_set1_epi64x(static_cast(prefix)); + + for (; i + 4 <= count; i += 4) { + // Load 4 x uint32_t and zero-extend to 4 x uint64_t + __m128i rows_32 = + _mm_loadu_si128(reinterpret_cast(row_ids + i)); + __m256i rows_64 = _mm256_cvtepu32_epi64(rows_32); + + // OR with prefix to create provenance values + __m256i result = _mm256_or_si256(rows_64, prefix_vec); + + // Store to aligned buffer, then write individually (page-safe) + alignas(32) uint64_t out[4]; + _mm256_store_si256(reinterpret_cast<__m256i *>(out), result); + + dest.write_at(start_idx + i, out[0]); + dest.write_at(start_idx + i + 1, out[1]); + dest.write_at(start_idx + i + 2, out[2]); + dest.write_at(start_idx + i + 3, out[3]); + } +#elif defined(__aarch64__) + // NEON: Process 2 x uint64_t at a time + const uint64x2_t prefix_vec = vdupq_n_u64(prefix); + + for (; i + 2 <= count; i += 2) { + // Load 2 x uint32_t and zero-extend to 2 x uint64_t + uint32x2_t rows_32 = vld1_u32(row_ids + i); + uint64x2_t rows_64 = vmovl_u32(rows_32); + + // OR with prefix + uint64x2_t result = vorrq_u64(rows_64, prefix_vec); + 
+ // Store individually (page boundary safe) + dest.write_at(start_idx + i, vgetq_lane_u64(result, 0)); + dest.write_at(start_idx + i + 1, vgetq_lane_u64(result, 1)); + } +#endif + + // Scalar remainder + for (; i < count; ++i) { + dest.write_at(start_idx + i, + prefix | static_cast(row_ids[i])); + } + + return count; +} + +/** + * @brief Copy provenance from source column using batch reads. + * + * Copies existing 64-bit provenance values from child intermediate. + * Uses contiguous batch access for better cache behavior. + * + * @param dest Destination deferred column + * @param start_idx Starting output index + * @param src Source deferred column (from child) + * @param row_ids Row indices into source column + * @param count Number of rows to copy + * @return Number of rows processed (always == count) + */ +inline size_t copy_provenance_batch(mema::deferred_column_t &dest, + size_t start_idx, + const mema::deferred_column_t &src, + const uint32_t *row_ids, size_t count) { + for (size_t i = 0; i < count; ++i) { + dest.write_at(start_idx + i, src[row_ids[i]]); + } + return count; +} + +} // namespace simd_provenance + +// ============================================================================ +// Source Precomputation Structures +// ============================================================================ + +/** + * @brief Precomputed metadata for deferred column sources. + * + * Tracks where each deferred column's provenance comes from: + * - For columnar inputs: encode fresh (table_id, column_idx, row_id) + * - For DeferredResult inputs: copy existing provenance from child + */ +struct DeferredColumnSource { + const mema::deferred_column_t *source_col = + nullptr; ///< Source if from intermediate. + uint8_t base_table_id = 0; ///< Base table ID for encoding. + uint8_t base_column_idx = 0; ///< Base column index for encoding. + bool from_build = false; ///< True if from build side. + bool needs_encode = false; ///< True if columnar (needs fresh encode). 
+}; + +/** + * @brief Precomputed metadata for materialized column sources. + * + * Eliminates per-row std::variant access and conditional checks in hot loop. + * Mirrors SourceInfo from construct_intermediate.h but for deferred path. + */ +struct alignas(8) MaterializedColumnSource { + const mema::column_t *intermediate_col = + nullptr; ///< Source if from DeferredResult materialized + const Column *columnar_col = nullptr; ///< Source if from ColumnarTable + const mema::deferred_column_t *deferred_resolve_col = + nullptr; ///< Source if needs deferred resolution + size_t child_output_idx = 0; ///< Index in child's output + size_t mat_col_idx = 0; ///< Index in result.materialized[] + DataType type = DataType::INT32; + uint8_t base_table_id = 0; ///< For VARCHAR source tracking + uint8_t base_column_idx = 0; ///< For VARCHAR source tracking + bool is_columnar = false; ///< True if source is ColumnarTable + bool from_build = false; ///< True if from build side + bool needs_deferred_resolve = false; ///< True if child deferred this column +}; + +// ============================================================================ +// Helper Functions +// ============================================================================ + /** * @brief Collect columns needed from a DeferredInput for page index building. */ @@ -92,8 +262,6 @@ inline void prepare_deferred_columns( } // Mark columns needed based on materialization decisions - // from_left refers to original left child - // build_is_left tells us if build side is the left child for (const auto &col : join_node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { bool from_build = (col.from_left == build_is_left); @@ -119,9 +287,6 @@ inline void prepare_deferred_columns( /** * @brief Create empty deferred result with proper schema. - * - * Used when total_matches == 0. Creates empty materialized columns - * for columns marked MATERIALIZE so they can be used in subsequent joins. 
*/ inline DeferredResult create_empty_deferred_result(const DeferredJoinNode &node) { @@ -129,89 +294,126 @@ create_empty_deferred_result(const DeferredJoinNode &node) { result.node_info = &node; result.num_rows = 0; result.materialized_map.resize(node.columns.size(), std::nullopt); - result.table_ids = node.tracked_table_ids; + result.deferred_map.resize(node.columns.size(), std::nullopt); - // Count and allocate empty materialized columns size_t mat_count = 0; + size_t def_count = 0; for (const auto &col : node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { result.materialized_map[col.original_idx] = mat_count++; + } else { + result.deferred_map[col.original_idx] = def_count++; } } result.materialized.resize(mat_count); - // Each column has 0 rows, which is valid for empty result - - // Also create empty row ID columns - result.row_ids.resize(node.tracked_table_ids.size()); - for (size_t i = 0; i < node.tracked_table_ids.size(); ++i) { - result.row_ids[i].table_id = node.tracked_table_ids[i]; - } + result.deferred_columns.resize(def_count); return result; } /** - * @brief Precomputed metadata for row ID column sources. - * - * Mirrors RowIdSource from construct_intermediate.h but adapted for - * DeferredInput. + * @brief Prepare deferred column sources for intermediate construction. */ -struct DeferredRowIdSource { - const mema::rowid_column_t *source_col = - nullptr; ///< Source if from intermediate. - uint8_t table_id = 0; ///< Table ID for encoding. - bool from_build = false; ///< True if from build side. - bool needs_encode = false; ///< True if columnar (needs GlobalRowId encode). -}; +inline std::vector +prepare_deferred_sources(const DeferredJoinNode &join_node, + const DeferredInput &build_input, + const DeferredInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.num_deferred_columns); -/** - * @brief Prepare row ID sources for deferred intermediate construction. 
- */ -inline std::vector -prepare_deferred_rowid_sources(const std::vector &merged_table_ids, - const DeferredInput &build_input, - const DeferredInput &probe_input) { - std::vector sources; - sources.reserve(merged_table_ids.size()); - - for (uint8_t tid : merged_table_ids) { - DeferredRowIdSource src; - src.table_id = tid; - - // Check build side first - auto build_tables = build_input.tracked_tables(); - bool in_build = std::find(build_tables.begin(), build_tables.end(), - tid) != build_tables.end(); - if (in_build) { - src.from_build = true; - if (build_input.is_columnar()) { + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::DEFER) + continue; + + DeferredColumnSource src; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + const auto *child_def = + src_input.get_deferred_column(col.child_output_idx); + if (child_def) { + src.needs_encode = false; + src.source_col = child_def; + } else { src.needs_encode = true; src.source_col = nullptr; - } else { - src.needs_encode = false; - src.source_col = build_input.get_rowid_column(tid); } + } + sources.push_back(src); + } + return sources; +} + +/** + * @brief Precompute materialized column sources for column-major iteration. + * + * For each MATERIALIZE column, determines source type and caches pointers + * to avoid per-row std::variant access in the hot loop. 
+ */ +inline std::vector prepare_materialized_sources( + const DeferredJoinNode &join_node, const DeferredInput &build_input, + const DeferredInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.columns.size()); + + size_t mat_idx = 0; + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + MaterializedColumnSource src; + src.mat_col_idx = mat_idx++; + src.child_output_idx = col.child_output_idx; + src.type = col.type; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.is_columnar = true; + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col.child_output_idx]; + src.columnar_col = &table->columns[actual_idx]; } else { - // Must be from probe side - src.from_build = false; - if (probe_input.is_columnar()) { - src.needs_encode = true; - src.source_col = nullptr; - } else { - src.needs_encode = false; - src.source_col = probe_input.get_rowid_column(tid); + src.is_columnar = false; + const auto &ir = std::get(src_input.data); + + if (ir.is_materialized(col.child_output_idx)) { + src.intermediate_col = + ir.get_materialized(col.child_output_idx); + } else if (ir.is_deferred(col.child_output_idx)) { + src.needs_deferred_resolve = true; + src.deferred_resolve_col = + ir.get_deferred(col.child_output_idx); } } sources.push_back(src); } + return sources; } +// ============================================================================ +// Main Construction Function +// ============================================================================ + /** * @brief Constructs deferred intermediate result from thread-local buffers. * + * Optimized with column-major iteration and SIMD provenance encoding. 
* Only materializes columns marked MATERIALIZE in the DeferredJoinNode. - * All row ID columns are populated for provenance tracking. + * Deferred columns store 64-bit provenance encoding for resolution at final + * output. * * @tparam Mode Collection mode for compile-time specialization. * @param buffers Thread-local match buffers from probe. @@ -220,11 +422,10 @@ prepare_deferred_rowid_sources(const std::vector &merged_table_ids, * @param join_node Deferred join node with materialization decisions. * @param remapped_attrs Output attributes (after build/probe remapping). * @param build_output_size Number of columns from build side. + * @param build_is_left True if build side is the original left child. * @param columnar_reader Reader for columnar data access. * @param out_result Output DeferredResult (populated in-place). - * @param merged_table_ids Sorted table IDs to track. - * @param deferred_plan Full deferred plan for base table access (deferred - * resolution). + * @param deferred_plan Full deferred plan for base table access. 
*/ template void construct_deferred_from_buffers( @@ -234,10 +435,9 @@ void construct_deferred_from_buffers( const std::vector> &remapped_attrs, size_t build_output_size, bool build_is_left, ColumnarReader &columnar_reader, DeferredResult &out_result, - const std::vector &merged_table_ids, const DeferredPlan &deferred_plan) { - // Count total matches + // Count total matches and compute buffer start offsets size_t total_matches = 0; std::vector buffer_starts(buffers.size()); for (size_t i = 0; i < buffers.size(); ++i) { @@ -250,85 +450,85 @@ void construct_deferred_from_buffers( return; } + // Initialize result metadata out_result.node_info = &join_node; out_result.num_rows = total_matches; - out_result.table_ids = merged_table_ids; - - // Build materialized_map: count MATERIALIZE columns and create mapping - // materialized_map[original_idx] -> index into out_result.materialized out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); - size_t mat_count = 0; + out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); - // Iterate over join_node.columns (which uses original output order) - // and assign materialized indices to MATERIALIZE columns + size_t mat_count = 0; + size_t def_count = 0; for (const auto &col : join_node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { out_result.materialized_map[col.original_idx] = mat_count++; + } else { + out_result.deferred_map[col.original_idx] = def_count++; } } - // Prepare row ID sources - auto rowid_sources = prepare_deferred_rowid_sources( - merged_table_ids, build_input, probe_input); - - const size_t num_rowid_cols = rowid_sources.size(); + // Precompute sources for column-major iteration + auto mat_sources = prepare_materialized_sources(join_node, build_input, + probe_input, build_is_left); + auto deferred_sources = prepare_deferred_sources( + join_node, build_input, probe_input, build_is_left); // Pre-allocate pages using Page = mema::column_t::Page; - using 
RowIdPage = mema::rowid_column_t::Page; - size_t total_pages_needed = + using DeferredPage = mema::deferred_column_t::Page; + size_t mat_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + size_t def_pages_needed = + (total_matches + mema::deferred_column_t::ENTRIES_PER_PAGE - 1) / + mema::deferred_column_t::ENTRIES_PER_PAGE; - // Allocate materialized columns out_result.materialized.resize(mat_count); for (size_t c = 0; c < mat_count; ++c) { - out_result.materialized[c].pages.resize(total_pages_needed); + out_result.materialized[c].pages.resize(mat_pages_needed); out_result.materialized[c].set_row_count(total_matches); } - // Allocate row ID columns - out_result.row_ids.resize(num_rowid_cols); - for (size_t r = 0; r < num_rowid_cols; ++r) { - out_result.row_ids[r].table_id = merged_table_ids[r]; - out_result.row_ids[r].pages.resize(total_pages_needed); - out_result.row_ids[r].set_row_count(total_matches); + out_result.deferred_columns.resize(def_count); + for (size_t d = 0; d < def_count; ++d) { + out_result.deferred_columns[d].pages.resize(def_pages_needed); + out_result.deferred_columns[d].set_row_count(total_matches); + } + + // Set source metadata for materialized columns + for (const auto &src : mat_sources) { + out_result.materialized[src.mat_col_idx].source_table = + src.base_table_id; + out_result.materialized[src.mat_col_idx].source_column = + src.base_column_idx; } - // Parallel page allocation const size_t num_threads = THREAD_COUNT; + + // Parallel page allocation worker_pool().execute([&](size_t t) { for (size_t c = 0; c < mat_count; ++c) { auto &col = out_result.materialized[c]; - for (size_t p = t; p < total_pages_needed; p += num_threads) { + for (size_t p = t; p < mat_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) .alloc_chunk(); col.pages[p] = reinterpret_cast(ptr); } } - for (size_t r = 0; r < num_rowid_cols; ++r) { - auto &rid_col = out_result.row_ids[r]; - for (size_t p = t; p < 
total_pages_needed; p += num_threads) { + for (size_t d = 0; d < def_count; ++d) { + auto &def_col = out_result.deferred_columns[d]; + for (size_t p = t; p < def_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) - .alloc_chunk(); - rid_col.pages[p] = reinterpret_cast(ptr); + .alloc_chunk< + Contest::platform::ChunkType::DEFERRED_PAGE>(); + def_col.pages[p] = reinterpret_cast(ptr); } } }); - // Set source metadata for materialized columns - for (const auto &col : join_node.columns) { - if (col.resolution == ColumnResolution::MATERIALIZE) { - size_t mat_idx = *out_result.materialized_map[col.original_idx]; - out_result.materialized[mat_idx].source_table = - col.provenance.base_table_id; - out_result.materialized[mat_idx].source_column = - col.provenance.base_column_idx; - } - } - - // Parallel population: each thread processes its own buffer + // ======================================================================== + // COLUMN-MAJOR PARALLEL POPULATION + // ======================================================================== worker_pool().execute([&](size_t t) { if (t >= buffers.size()) return; @@ -340,102 +540,105 @@ void construct_deferred_from_buffers( size_t start = buffer_starts[t]; ColumnarReader::Cursor cursor; - // Iterate through matches - auto left_it = buf.left_range().begin(); - auto right_it = buf.right_range().begin(); - - for (size_t m = 0; m < my_count; ++m) { - uint32_t build_row = 0, probe_row = 0; - - if constexpr (Mode == MatchCollectionMode::BOTH) { - build_row = *left_it; - probe_row = *right_it; - ++left_it; - ++right_it; - } else if constexpr (Mode == MatchCollectionMode::LEFT_ONLY) { - build_row = *left_it; - ++left_it; - } else { - probe_row = *right_it; - ++right_it; - } - - size_t out_row = start + m; - - // Write materialized columns - for (const auto &col : join_node.columns) { - if (col.resolution != ColumnResolution::MATERIALIZE) - continue; - - size_t mat_col_idx = - 
*out_result.materialized_map[col.original_idx]; - auto &out_col = out_result.materialized[mat_col_idx]; - - // Determine source based on from_left and build/probe mapping - // col.from_left refers to original left child - // build_is_left tells us if build side is the left child - // If from_left && build_is_left => from build - // If from_left && !build_is_left => from probe (left became - // probe) - bool from_build = (col.from_left == build_is_left); - uint32_t src_row = from_build ? build_row : probe_row; - const auto &src_input = from_build ? build_input : probe_input; - - mema::value_t val; - if (src_input.is_columnar()) { - const auto *table = - std::get(src_input.data); - auto [actual_idx, _] = - src_input.node->output_attrs[col.child_output_idx]; - val = columnar_reader.read_value( - table->columns[actual_idx], col.child_output_idx, - src_row, col.type, cursor, from_build); - } else { - const auto &ir = std::get(src_input.data); - // Check if materialized in child - const auto *src_col = - ir.get_materialized(col.child_output_idx); - if (src_col) { - val = (*src_col)[src_row]; + // ==================================================================== + // Process MATERIALIZED columns (column-major for cache locality) + // ==================================================================== + for (const auto &src : mat_sources) { + auto &dest_col = out_result.materialized[src.mat_col_idx]; + + // Get appropriate range based on which side this column comes from + auto range = src.from_build ? 
buf.left_range() : buf.right_range(); + + if (src.is_columnar) { + // Columnar source - use ColumnarReader with cursor caching + const auto &col = *src.columnar_col; + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, + columnar_reader.read_value( + col, src.child_output_idx, rid, + src.type, cursor, src.from_build)); + } + } else if (src.intermediate_col) { + // Intermediate materialized source - direct copy + const auto &vec = *src.intermediate_col; + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, vec[rid]); + } + } else if (src.needs_deferred_resolve && src.deferred_resolve_col) { + // Deferred in child - resolve via provenance + const auto &def_col = *src.deferred_resolve_col; + size_t k = start; + for (uint32_t rid : range) { + uint64_t prov = def_col[rid]; + uint8_t base_tid = DeferredProvenance::table(prov); + uint8_t base_col = DeferredProvenance::column(prov); + uint64_t base_row = DeferredProvenance::row(prov); + + if (deferred_plan.original_plan) [[likely]] { + const auto &base_table = + deferred_plan.original_plan->inputs[base_tid]; + mema::value_t val = + columnar_reader.read_value_direct_public( + base_table.columns[base_col], + static_cast(base_row), src.type); + dest_col.write_at(k++, val); } else { - // Deferred - resolve via row ID to base table - // This should only happen if materialization wasn't - // propagated properly. Use direct read as fallback. 
- const auto *rowid_col = - ir.get_rowid_column(col.provenance.base_table_id); - if (rowid_col && deferred_plan.original_plan) { - uint32_t encoded = (*rowid_col)[src_row]; - uint32_t base_row = GlobalRowId::row(encoded); - const auto &base_table = - deferred_plan.original_plan - ->inputs[col.provenance.base_table_id]; - val = columnar_reader.read_value_direct_public( - base_table - .columns[col.provenance.base_column_idx], - base_row, col.type); - } else { - val = mema::value_t{mema::value_t::NULL_VALUE}; - } + dest_col.write_at( + k++, mema::value_t{mema::value_t::NULL_VALUE}); } } - - out_col.write_at(out_row, val); } + } - // Write row ID columns - for (size_t r = 0; r < num_rowid_cols; ++r) { - const auto &rid_src = rowid_sources[r]; - auto &dest_rid_col = out_result.row_ids[r]; - - uint32_t local_idx = rid_src.from_build ? build_row : probe_row; - - if (rid_src.needs_encode) { - dest_rid_col.write_at( - out_row, - GlobalRowId::encode(rid_src.table_id, local_idx)); - } else if (rid_src.source_col) { - dest_rid_col.write_at(out_row, - (*rid_src.source_col)[local_idx]); + // ==================================================================== + // Process DEFERRED columns (column-major with SIMD batch encoding) + // ==================================================================== + for (size_t d = 0; d < deferred_sources.size(); ++d) { + const auto &def_src = deferred_sources[d]; + auto &dest_def_col = out_result.deferred_columns[d]; + + if (def_src.needs_encode) { + // Fresh encoding from columnar input - use SIMD batch + auto batch_reader = def_src.from_build + ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + // Request larger batches for SIMD efficiency + constexpr size_t MAX_BATCH = + simd_provenance::BATCH_SIZE > 0 ? 
64 : 256; + const uint32_t *row_ids = + batch_reader.get_batch(MAX_BATCH, batch_count); + + if (batch_count > 0) { + simd_provenance::encode_provenance_batch( + dest_def_col, k, row_ids, batch_count, + def_src.base_table_id, def_src.base_column_idx); + k += batch_count; + } + } + } else if (def_src.source_col) { + // Copy existing provenance from child intermediate + auto batch_reader = def_src.from_build + ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + simd_provenance::copy_provenance_batch( + dest_def_col, k, *def_src.source_col, row_ids, + batch_count); + k += batch_count; + } } } } diff --git a/include/materialization/materialize_deferred.h b/include/materialization/materialize_deferred.h index 8b548c0..bd7a2af 100644 --- a/include/materialization/materialize_deferred.h +++ b/include/materialization/materialize_deferred.h @@ -3,7 +3,8 @@ * @brief Final materialization for deferred execution path. * * Materializes all output columns at the root join, resolving deferred - * columns by following row ID provenance back to base tables. + * columns by decoding 64-bit provenance (table_id, column_idx, row_id) back + * to base tables. * * @see construct_deferred.h for building DeferredResult intermediates. * @see materialize.h for the eager materialization equivalent. @@ -136,7 +137,7 @@ inline ColumnarTable create_empty_deferred_final( * Handles three cases: * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index * 2. MATERIALIZED: Column was materialized in DeferredResult - * 3. DEFERRED: Resolve via row ID lookup to base table + * 3. DEFERRED: Resolve via 64-bit provenance to base table * * @tparam Mode Collection mode for compile-time specialization. * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. 
@@ -249,7 +250,8 @@ inline void materialize_deferred_column( /** * @brief Materialize single output column handling deferred resolution. * - * For deferred columns, resolves via row ID tracking back to base table. + * For deferred columns, resolves via 64-bit provenance encoding back to + * base table. * * @tparam Mode Collection mode for compile-time specialization. */ @@ -276,20 +278,13 @@ inline void materialize_single_deferred_column( } // Determine if this column comes from build or probe side at runtime - // col_info->from_left refers to original left child - // build_is_left tells us if build side is the left child - // If from_left && build_is_left => from build - // If from_left && !build_is_left => from probe (left became probe) - // If !from_left && build_is_left => from probe (right is probe) - // If !from_left && !build_is_left => from build (right became build) bool from_build = (col_info->from_left == build_is_left); const DeferredInput &src_input = from_build ? build_input : probe_input; // Determine how to read the value const Column *columnar_source = nullptr; const mema::column_t *materialized_source = nullptr; - const mema::rowid_column_t *rowid_source = nullptr; - const Column *base_table_column = nullptr; + const mema::deferred_column_t *deferred_source = nullptr; if (src_input.is_columnar()) { // Direct columnar read @@ -303,16 +298,9 @@ inline void materialize_single_deferred_column( // Read from materialized column materialized_source = ir.get_materialized(col_info->child_output_idx); - } else { - // Deferred - need to resolve via row ID - rowid_source = - ir.get_rowid_column(col_info->provenance.base_table_id); - if (deferred_plan.original_plan) { - base_table_column = - &deferred_plan.original_plan - ->inputs[col_info->provenance.base_table_id] - .columns[col_info->provenance.base_column_idx]; - } + } else if (ir.is_deferred(col_info->child_output_idx)) { + // Deferred - need to resolve via 64-bit provenance + deferred_source = 
ir.get_deferred(col_info->child_output_idx); } } @@ -325,13 +313,17 @@ inline void materialize_single_deferred_column( col_info->type, cursor, from_build); } else if (materialized_source) { return (*materialized_source)[local_row_id]; - } else if (rowid_source && base_table_column) { - // Deferred resolution: get base table row from encoded row ID - uint32_t encoded = (*rowid_source)[local_row_id]; - uint32_t base_row = GlobalRowId::row(encoded); + } else if (deferred_source && deferred_plan.original_plan) { + // Deferred resolution: decode 64-bit provenance + uint64_t prov = (*deferred_source)[local_row_id]; + uint8_t base_tid = DeferredProvenance::table(prov); + uint8_t base_col = DeferredProvenance::column(prov); + uint64_t base_row = DeferredProvenance::row(prov); + const auto &base_table = + deferred_plan.original_plan->inputs[base_tid]; return columnar_reader.read_value( - *base_table_column, col_info->provenance.base_column_idx, - base_row, col_info->type, cursor, true); + base_table.columns[base_col], base_col, + static_cast(base_row), col_info->type, cursor, true); } return mema::value_t{mema::value_t::NULL_VALUE}; }; @@ -357,8 +349,12 @@ inline void materialize_single_deferred_column( str_src_ptr = &deferred_plan.original_plan ->inputs[materialized_source->source_table] .columns[materialized_source->source_column]; - } else if (base_table_column) { - str_src_ptr = base_table_column; + } else if (deferred_source && deferred_plan.original_plan) { + // For deferred VARCHAR, get source from provenance of first row + // All rows in a deferred column share the same base table/column + str_src_ptr = &deferred_plan.original_plan + ->inputs[col_info->provenance.base_table_id] + .columns[col_info->provenance.base_column_idx]; } } @@ -383,7 +379,7 @@ inline void materialize_single_deferred_column( * @brief Materialize all output columns from deferred intermediate. * * For root join in deferred execution path. 
Resolves all deferred columns - * by following row ID provenance to base tables. + * by decoding 64-bit provenance to base tables. * * @tparam Mode Collection mode for compile-time specialization. * @param buffers Thread-local match buffers from probe. diff --git a/include/platform/arena.h b/include/platform/arena.h index f1aa32e..59d3442 100644 --- a/include/platform/arena.h +++ b/include/platform/arena.h @@ -41,12 +41,13 @@ static constexpr size_t PAGE_2MB = 2 * 1024 * 1024; * @brief Chunk type enumeration for arena regions. */ enum class ChunkType : uint8_t { - HASH_CHUNK = 0, ///< 4KB - hash table partition chunks - IR_PAGE = 1, ///< 16KB - intermediate result pages - INDEX_CHUNK = 2, ///< 32KB - match collector index chunks - GENERAL = 3, ///< Variable - misc allocations + HASH_CHUNK = 0, ///< 4KB - hash table partition chunks + IR_PAGE = 1, ///< 16KB - intermediate result pages (32-bit values) + INDEX_CHUNK = 2, ///< 32KB - match collector index chunks + DEFERRED_PAGE = 3, ///< 32KB - deferred provenance pages (64-bit values) + GENERAL = 4, ///< Variable - misc allocations - NUM_TYPES = 4 + NUM_TYPES = 5 }; // ============================================================================ @@ -67,12 +68,15 @@ template <> struct ChunkSize { template <> struct ChunkSize { static constexpr size_t value = 32768; }; +template <> struct ChunkSize { + static constexpr size_t value = 32768; +}; template <> struct ChunkSize { static constexpr size_t value = 0; }; /// Runtime chunk size array indexed by ChunkType. 
-inline constexpr size_t CHUNK_SIZES[] = {4096, 16384, 32768, 0}; +inline constexpr size_t CHUNK_SIZES[] = {4096, 16384, 32768, 32768, 0}; // ============================================================================ // Page Policies @@ -92,6 +96,7 @@ inline constexpr PagePolicy REGION_PAGE_POLICY[] = { PagePolicy::SMALL_PAGES, // HASH_CHUNK PagePolicy::HUGE_PAGES, // IR_PAGE PagePolicy::HUGE_PAGES, // INDEX_CHUNK + PagePolicy::HUGE_PAGES, // DEFERRED_PAGE PagePolicy::HUGE_PAGES, // GENERAL }; @@ -102,7 +107,7 @@ inline constexpr PagePolicy REGION_PAGE_POLICY[] = { /** * @brief Region size configuration based on available DRAM. * - * Uses 75% of SPC__NUMA_NODE_DRAM_MB, divided equally (25%) among 4 regions. + * Uses 75% of SPC__NUMA_NODE_DRAM_MB, divided equally (20%) among 5 regions. */ struct RegionConfig { size_t total_arena_bytes; @@ -113,8 +118,8 @@ struct RegionConfig { 1024ULL * 1024ULL * 3ULL / 4ULL; } - /// Get total size for a region (25% each). - size_t get(ChunkType /*ct*/) const { return total_arena_bytes / 4; } + /// Get total size for a region (20% each). + size_t get(ChunkType /*ct*/) const { return total_arena_bytes / 5; } /// Get total arena size. size_t total() const { return total_arena_bytes; } @@ -450,7 +455,8 @@ class ArenaManager { // Global Instance and Helper // ============================================================================ -/// Global arena manager instance (inline global, constructed at program startup). +/// Global arena manager instance (inline global, constructed at program +/// startup). inline ArenaManager g_arena_manager{}; /// Get thread arena by thread ID. diff --git a/src/analyze_plan.cpp b/src/analyze_plan.cpp index f0ef0a8..f511c60 100644 --- a/src/analyze_plan.cpp +++ b/src/analyze_plan.cpp @@ -135,34 +135,6 @@ compute_base_collection_mode(const std::vector &columns, return join::MatchCollectionMode::BOTH; } -/** - * @brief Collect tracked table IDs from a DeferredNode. 
- */ -std::vector get_tracked_tables(const DeferredNode &node) { - if (const auto *scan = std::get_if(&node)) { - return {scan->base_table_id}; - } - return std::get(node).tracked_table_ids; -} - -/** - * @brief Merge tracked table IDs from two children (sorted, unique). - */ -std::vector merge_table_ids(const DeferredNode &left, - const DeferredNode &right) { - auto left_ids = get_tracked_tables(left); - auto right_ids = get_tracked_tables(right); - - std::vector result; - result.reserve(left_ids.size() + right_ids.size()); - - std::merge(left_ids.begin(), left_ids.end(), right_ids.begin(), - right_ids.end(), std::back_inserter(result)); - - result.erase(std::unique(result.begin(), result.end()), result.end()); - return result; -} - } // anonymous namespace DeferredPlan analyze_plan(const Plan &plan) { @@ -264,11 +236,17 @@ DeferredPlan analyze_plan(const Plan &plan) { djoin.columns.push_back(std::move(info)); } - // Compute collection mode and tracked tables + // Compute collection mode and count deferred columns djoin.base_collection_mode = compute_base_collection_mode(djoin.columns, left_size); - djoin.tracked_table_ids = merge_table_ids( - deferred.nodes[join.left], deferred.nodes[join.right]); + + // Count deferred columns for pre-allocation + djoin.num_deferred_columns = 0; + for (const auto &col : djoin.columns) { + if (col.resolution == ColumnResolution::DEFER) { + ++djoin.num_deferred_columns; + } + } deferred.nodes[node_idx] = std::move(djoin); } @@ -305,6 +283,20 @@ DeferredPlan analyze_plan(const Plan &plan) { } } + // PASS 3: Recount num_deferred_columns after propagation + for (size_t node_idx : post_order) { + auto *djoin = std::get_if(&deferred.nodes[node_idx]); + if (!djoin) + continue; + + djoin->num_deferred_columns = 0; + for (const auto &col : djoin->columns) { + if (col.resolution == ColumnResolution::DEFER) { + ++djoin->num_deferred_columns; + } + } + } + return deferred; } diff --git a/src/execute.cpp b/src/execute.cpp index 
d8ef462..b9d45e5 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -379,54 +379,14 @@ DeferredJoinResult execute_deferred_join_with_mode( const UnchainedHashtable *hash_table, const DeferredInput &build_input, const DeferredInput &probe_input, const BuildProbeConfig &config, const DeferredJoinNode &join_node, io::ColumnarReader &columnar_reader, - const DeferredPlan &deferred_plan, - const std::vector &merged_table_ids, TimingStats &stats) { + const DeferredPlan &deferred_plan, TimingStats &stats) { std::vector> match_buffers; - // Probe phase - need to convert DeferredInput to JoinInput for probing - // For now, handle columnar probe directly if (use_nested_loop) { auto nested_loop_start = std::chrono::high_resolution_clock::now(); - // Nested loop requires JoinInput - create adapter - JoinInput build_ji, probe_ji; - build_ji.node = build_input.node; - probe_ji.node = probe_input.node; - - if (build_input.is_columnar()) { - build_ji.data = std::get(build_input.data); - build_ji.table_id = build_input.table_id; - } else { - // Convert DeferredResult to ExtendedResult for compatibility - // This is a limitation - nested loop path falls back to eager - const auto &dr = std::get(build_input.data); - ExtendedResult er; - er.columns = std::move( - const_cast &>(dr.materialized)); - er.row_ids = std::move( - const_cast &>(dr.row_ids)); - er.table_ids = dr.table_ids; - build_ji.data = std::move(er); - build_ji.table_id = 0; - } - - if (probe_input.is_columnar()) { - probe_ji.data = std::get(probe_input.data); - probe_ji.table_id = probe_input.table_id; - } else { - const auto &dr = std::get(probe_input.data); - ExtendedResult er; - er.columns = std::move( - const_cast &>(dr.materialized)); - er.row_ids = std::move( - const_cast &>(dr.row_ids)); - er.table_ids = dr.table_ids; - probe_ji.data = std::move(er); - probe_ji.table_id = 0; - } - - match_buffers = nested_loop_join( - build_ji, probe_ji, config.build_attr, config.probe_attr); + match_buffers = 
nested_loop_join_deferred( + build_input, probe_input, config.build_attr, config.probe_attr); auto nested_loop_end = std::chrono::high_resolution_clock::now(); stats.nested_loop_join_ms += std::chrono::duration_cast( @@ -508,8 +468,7 @@ DeferredJoinResult execute_deferred_join_with_mode( construct_deferred_from_buffers( match_buffers, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, result, merged_table_ids, - deferred_plan); + config.build_left, columnar_reader, result, deferred_plan); } else { result = create_empty_deferred_result(join_node); } @@ -559,15 +518,9 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, const size_t HASH_TABLE_THRESHOLD = 8; size_t build_rows = build_input.row_count(config.build_attr); - // Nested loop doesn't work with DeferredResult because it only has join - // keys materialized. Force hash join when either side is DeferredResult. - bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD) && - build_is_columnar && probe_is_columnar; - - // Merge table IDs - auto build_tables = build_input.tracked_tables(); - auto probe_tables = probe_input.tracked_tables(); - auto merged_table_ids = merge_tracked_tables(build_tables, probe_tables); + // Use nested loop for small build tables - works with both columnar and + // DeferredResult inputs (join keys are always materialized). + bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD); io::ColumnarReader columnar_reader; auto setup_end = std::chrono::high_resolution_clock::now(); @@ -575,10 +528,15 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, setup_end - setup_start) .count(); - // For deferred materialization, we always need BOTH row indices because - // we track provenance from both sides for deferred column resolution. - // The optimization to collect only one side's indices is not safe here. 
- MatchCollectionMode mode = MatchCollectionMode::BOTH; + // Use pre-computed collection mode from plan analysis. + // base_collection_mode assumes build=left; flip if build=right at runtime. + MatchCollectionMode mode = djoin.base_collection_mode; + if (!config.build_left) { + if (mode == MatchCollectionMode::LEFT_ONLY) + mode = MatchCollectionMode::RIGHT_ONLY; + else if (mode == MatchCollectionMode::RIGHT_ONLY) + mode = MatchCollectionMode::LEFT_ONLY; + } // Build hash table if needed std::optional hash_table; @@ -618,22 +576,19 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, return execute_deferred_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, - merged_table_ids, stats); + probe_input, config, djoin, columnar_reader, deferred_plan, stats); case MatchCollectionMode::LEFT_ONLY: return execute_deferred_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, - merged_table_ids, stats); + probe_input, config, djoin, columnar_reader, deferred_plan, stats); case MatchCollectionMode::RIGHT_ONLY: return execute_deferred_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? 
nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, - merged_table_ids, stats); + probe_input, config, djoin, columnar_reader, deferred_plan, stats); } return DeferredResult{}; From c7ccae48ab90b6f9da646e04143d8d27ae82de46 Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 02:19:04 +0200 Subject: [PATCH 04/13] chore: integrating deferred materialization --- CMakeLists.txt | 9 - include/data_model/deferred_intermediate.h | 172 ---- include/data_model/deferred_plan.h | 36 +- include/data_model/intermediate.h | 308 +++---- include/join_execution/hash_join.h | 32 +- include/join_execution/join_setup.h | 268 +------ include/join_execution/nested_loop.h | 220 +---- include/materialization/construct_deferred.h | 649 --------------- .../materialization/construct_intermediate.h | 749 ++++++++++++------ include/materialization/materialize.h | 328 +++++--- .../materialization/materialize_deferred.h | 435 ---------- src/analyze_plan.cpp | 95 ++- src/execute.cpp | 448 ++--------- 13 files changed, 1097 insertions(+), 2652 deletions(-) delete mode 100644 include/data_model/deferred_intermediate.h delete mode 100644 include/materialization/construct_deferred.h delete mode 100644 include/materialization/materialize_deferred.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2621d56..fc63a09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,15 +54,6 @@ FetchContent_Declare( FetchContent_MakeAvailable(fmtlib) -set(ENABLE_SANITIZER OFF) -set(ENABLE_UBSAN OFF) - -# Deferred materialization: only materialize join keys, defer other columns -option(USE_DEFERRED_MATERIALIZATION "Enable deferred column materialization" OFF) -if(USE_DEFERRED_MATERIALIZATION) - message(STATUS "Deferred materialization ENABLED") - add_compile_definitions(USE_DEFERRED_MATERIALIZATION) -endif() if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|powerpc|ppc64|ppc64le") message("Disabling jemalloc extension of DuckDB on Power.") 
set(SKIP_EXTENSIONS jemalloc) diff --git a/include/data_model/deferred_intermediate.h b/include/data_model/deferred_intermediate.h deleted file mode 100644 index 8e183d5..0000000 --- a/include/data_model/deferred_intermediate.h +++ /dev/null @@ -1,172 +0,0 @@ -/** - * @file deferred_intermediate.h - * @brief Lightweight intermediate result for deferred materialization. - * - * DeferredResult stores only materialized columns (join keys) plus - * per-deferred-column provenance using 64-bit encoding (table_id, column_idx, - * row_id). Deferred columns are resolved at final materialization by decoding - * the provenance and reading directly from base tables. - * - * @see deferred_plan.h for DeferredJoinNode with column decisions. - * @see construct_deferred.h for building DeferredResult. - * @see materialize_deferred.h for final resolution. - */ -#pragma once - -#include -#include -#include -#include - -#include -#include - -namespace Contest { - -/** - * @brief Lightweight intermediate result with only join keys materialized. - * - * Unlike ExtendedResult which stores all projected columns, DeferredResult - * stores only columns marked MATERIALIZE (typically just the parent's join - * key). All other columns are resolved at final materialization using - * per-column 64-bit provenance (table_id, column_idx, row_id). - * - * Memory savings: For a join projecting N columns where only 1 is a join key, - * DeferredResult uses ~1/N the memory of ExtendedResult for data columns. - * Additionally, we only track provenance for deferred columns (not all tables). - * - * @see DeferredColumnInfo for materialization decisions. - * @see DeferredProvenance for 64-bit encoding scheme. - */ -struct DeferredResult { - /// Only columns marked MATERIALIZE (typically 1 join key). - std::vector materialized; - - /// Map: original column index → index in materialized (nullopt if - /// deferred). 
- std::vector> materialized_map; - - /// Per-deferred-column provenance (64-bit encoded table_id+column_idx+row). - /// One deferred_column_t per DEFER column, stores full provenance per row. - std::vector deferred_columns; - - /// Map: original column index → index in deferred_columns (nullopt if - /// materialized). - std::vector> deferred_map; - - /// Reference to node info for column provenance resolution. - const DeferredJoinNode *node_info = nullptr; - - /// Total row count. - size_t num_rows = 0; - - DeferredResult() = default; - DeferredResult(DeferredResult &&) = default; - DeferredResult &operator=(DeferredResult &&) = default; - DeferredResult(const DeferredResult &) = delete; - DeferredResult &operator=(const DeferredResult &) = delete; - - /** @brief Total row count. */ - size_t row_count() const { return num_rows; } - - /** @brief Check if column was materialized (not deferred). */ - bool is_materialized(size_t orig_idx) const { - return orig_idx < materialized_map.size() && - materialized_map[orig_idx].has_value(); - } - - /** @brief Check if column is deferred. */ - bool is_deferred(size_t orig_idx) const { - return orig_idx < deferred_map.size() && - deferred_map[orig_idx].has_value(); - } - - /** @brief Get materialized column, or nullptr if deferred. */ - const mema::column_t *get_materialized(size_t orig_idx) const { - if (!is_materialized(orig_idx)) - return nullptr; - return &materialized[*materialized_map[orig_idx]]; - } - - /** @brief Get deferred column provenance, or nullptr if materialized. */ - const mema::deferred_column_t *get_deferred(size_t orig_idx) const { - if (!is_deferred(orig_idx)) - return nullptr; - return &deferred_columns[*deferred_map[orig_idx]]; - } - - /** @brief Get mutable deferred column provenance, or nullptr. 
*/ - mema::deferred_column_t *get_deferred_mut(size_t orig_idx) { - if (!is_deferred(orig_idx)) - return nullptr; - return &deferred_columns[*deferred_map[orig_idx]]; - } - - /** @brief Number of deferred columns. */ - size_t num_deferred() const { return deferred_columns.size(); } -}; - -/** - * @brief Input abstraction for deferred execution path. - * - * Similar to JoinInput but works with DeferredResult instead of ExtendedResult. - * Provides uniform interface for columnar (base table) and deferred - * intermediate data sources. - */ -struct DeferredInput { - /// Either base table pointer or owned DeferredResult. - std::variant data; - - /// Original plan node for output_attrs mapping. - const PlanNode *node = nullptr; - - /// Deferred plan node for materialization decisions. - const DeferredNode *deferred_node = nullptr; - - /// Base table ID (for columnar inputs). - uint8_t table_id = 0; - - /** @brief True if data is columnar (base table). */ - bool is_columnar() const { - return std::holds_alternative(data); - } - - /** @brief Row count for join key column. */ - size_t row_count(size_t col_idx) const { - if (is_columnar()) { - const auto *table = std::get(data); - return table->num_rows; - } - return std::get(data).row_count(); - } - - /** @brief Total row count. */ - size_t row_count() const { - if (is_columnar()) { - const auto *table = std::get(data); - return table->num_rows; - } - return std::get(data).row_count(); - } - - /** @brief Number of output columns. */ - size_t output_size() const { - if (node) - return node->output_attrs.size(); - return 0; - } - - /** - * @brief Get deferred column provenance for a column index. - * - * For columnar inputs, returns nullptr (caller must encode fresh). - * For DeferredResult inputs, returns existing provenance column. 
- */ - const mema::deferred_column_t *get_deferred_column(size_t col_idx) const { - if (is_columnar()) - return nullptr; - return std::get(data).get_deferred(col_idx); - } -}; - -} // namespace Contest diff --git a/include/data_model/deferred_plan.h b/include/data_model/deferred_plan.h index abb934f..13be4dd 100644 --- a/include/data_model/deferred_plan.h +++ b/include/data_model/deferred_plan.h @@ -1,14 +1,14 @@ /** * @file deferred_plan.h - * @brief Analyzed plan with materialization decisions for deferred execution. + * @brief Analyzed plan with materialization decisions for execution. * - * DeferredPlan mirrors the original Plan structure but includes pre-computed + * AnalyzedPlan mirrors the original Plan structure but includes pre-computed * decisions about which columns to materialize eagerly (join keys) vs defer - * until final output. Each DeferredJoinNode tracks column provenance back to + * until final output. Each AnalyzedJoinNode tracks column provenance back to * base tables for efficient deferred resolution. * * @see analyze_plan.cpp for the analysis algorithm. - * @see deferred_intermediate.h for the runtime result format. + * @see intermediate.h for the runtime result format. */ #pragma once @@ -42,12 +42,12 @@ struct ColumnProvenance { }; /** - * @brief Complete metadata for an output column in a deferred join. + * @brief Complete metadata for an output column in a join. * * Combines materialization decision, provenance tracking, and child source * information for efficient intermediate construction and final resolution. */ -struct DeferredColumnInfo { +struct AnalyzedColumnInfo { size_t original_idx; ///< Index in node's output_attrs. DataType type; ///< INT32 or VARCHAR. @@ -59,11 +59,11 @@ struct DeferredColumnInfo { }; /** - * @brief Analyzed scan node for deferred execution. + * @brief Analyzed scan node for execution. * * Wraps a ScanNode with output attribute information. 
*/ -struct DeferredScanNode { +struct AnalyzedScanNode { size_t node_idx; ///< Index in original Plan::nodes. uint8_t base_table_id; ///< Index into Plan::inputs. std::vector> output_attrs; ///< Projected cols. @@ -72,13 +72,13 @@ struct DeferredScanNode { /** * @brief Analyzed join node with pre-computed materialization decisions. * - * Contains all information needed for deferred execution: + * Contains all information needed for execution: * - Which columns to materialize eagerly (join keys for parent) * - Column provenance for deferred resolution * - Pre-computed match collection mode * - Number of deferred columns for allocation */ -struct DeferredJoinNode { +struct AnalyzedJoinNode { size_t node_idx; ///< Index in original Plan::nodes. size_t left_child_idx; ///< Left child index in Plan::nodes. @@ -90,7 +90,7 @@ struct DeferredJoinNode { std::vector> output_attrs; /// Per-column materialization decisions and provenance. - std::vector columns; + std::vector columns; /// Pre-computed collection mode (assumes build=left; flip if build=right). join::MatchCollectionMode base_collection_mode; @@ -106,9 +106,9 @@ struct DeferredJoinNode { }; /** - * @brief Plan node variant for deferred execution. + * @brief Plan node variant for execution. */ -using DeferredNode = std::variant; +using AnalyzedNode = std::variant; /** * @brief Analyzed plan with materialization decisions. @@ -117,12 +117,12 @@ using DeferredNode = std::variant; * materialization. The original_plan pointer provides access to base tables * for value resolution. */ -struct DeferredPlan { - std::vector nodes; ///< Analyzed nodes (same indices as Plan). +struct AnalyzedPlan { + std::vector nodes; ///< Analyzed nodes (same indices as Plan). size_t root; ///< Root node index. const Plan *original_plan; ///< Non-owning reference to original plan. 
- const DeferredNode &operator[](size_t idx) const { return nodes[idx]; } + const AnalyzedNode &operator[](size_t idx) const { return nodes[idx]; } }; /** @@ -135,8 +135,8 @@ struct DeferredPlan { * 4. Pre-computed collection mode based on output columns * * @param plan Original query plan. - * @return DeferredPlan with materialization decisions. + * @return AnalyzedPlan with materialization decisions. */ -DeferredPlan analyze_plan(const Plan &plan); +AnalyzedPlan analyze_plan(const Plan &plan); } // namespace Contest diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index f4fa9c8..4a29919 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -1,18 +1,30 @@ /** * @file intermediate.h - * @brief Intermediate join format: VARCHAR as page/offset refs (no string - * copy). + * @brief Intermediate join result types and input abstraction. * - * Base tables must outlive execution. @see plan.h ColumnarTable, - * construct_intermediate.h + * Provides: + * - mema::value_t: 4-byte value encoding (INT32 direct, VARCHAR as page/offset) + * - mema::column_t: 16KB-paged column for materialized values + * - mema::deferred_column_t: 32KB-paged column for 64-bit provenance encoding + * - IntermediateResult: Lightweight result with selective materialization + * - JoinInput: Unified abstraction over columnar tables and intermediate + * results + * + * Base tables must outlive execution. + * + * @see plan.h for ColumnarTable, construct_intermediate.h for building results. + * @see deferred_plan.h for AnalyzedJoinNode with column decisions. */ #pragma once #include #include +#include #include #include +#include #include +#include #include /** @@ -20,13 +32,17 @@ * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages). * * value_t: INT32 direct or VARCHAR page/offset ref. column_t: arena-allocated - * pages with write_at(). @see Contest::ExecuteResult, plan.h ColumnarTable. + * pages with write_at(). 
+ * + * @see Contest::IntermediateResult, plan.h ColumnarTable. */ namespace mema { /** * @brief 4-byte value: INT32 direct, VARCHAR packed (19-bit page + 13-bit - * offset), NULL = INT32_MIN, long string offset = 0x1FFF. Refs valid only while + * offset). + * + * NULL = INT32_MIN, long string offset = 0x1FFF. Refs valid only while * source exists. */ struct alignas(4) value_t { @@ -45,16 +61,18 @@ struct alignas(4) value_t { offset_idx = (static_cast(encoded) >> 19) & 0x1FFF; } - static constexpr int32_t LONG_STRING_OFFSET = - 0x1FFF; /**< Sentinel for long strings. */ - static constexpr int32_t NULL_VALUE = - INT32_MIN; /**< NULL sentinel for both types. */ + /** @brief Sentinel for long strings. */ + static constexpr int32_t LONG_STRING_OFFSET = 0x1FFF; + + /** @brief NULL sentinel for both types. */ + static constexpr int32_t NULL_VALUE = INT32_MIN; /** @brief Check if this value represents NULL. */ inline bool is_null() const { return value == NULL_VALUE; } }; -/** @brief Page size for intermediate results (16KB, larger than ColumnarTable). +/** + * @brief Page size for intermediate results (16KB, larger than ColumnarTable). */ constexpr size_t IR_PAGE_SIZE = 1 << 14; @@ -82,9 +100,12 @@ struct column_t { public: std::vector pages; /**< Pointers to arena-allocated pages. */ - uint8_t source_table = - 0; /**< Base table index for VARCHAR dereferencing. */ - uint8_t source_column = 0; /**< Column index within source table. */ + + /** @brief Base table index for VARCHAR dereferencing. */ + uint8_t source_table = 0; + + /** @brief Column index within source table. */ + uint8_t source_column = 0; public: column_t() = default; @@ -114,8 +135,10 @@ struct column_t { ~column_t() = default; - /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. No bounds - * check. */ + /** + * @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. + * @note No bounds check. 
+ */ inline const value_t &operator[](size_t idx) const { return pages[idx >> 12]->data[idx & 0xFFF]; } @@ -152,84 +175,6 @@ struct column_t { /** @brief Alias for a collection of intermediate columns. */ using Columnar = std::vector; -/** - * @brief Row ID column storing encoded global row IDs. - * - * Parallel structure to column_t but stores uint32_t (encoded table_id + - * row_id). One column per base table participating in joins up to this point. - * Uses same page size and arena allocation as column_t. - * - * @see GlobalRowId for encoding scheme, ExtendedResult for usage. - */ -struct rowid_column_t { - /** @brief Page for row ID storage: fixed array of uint32_t entries. */ - struct alignas(IR_PAGE_SIZE) Page { - uint32_t data[CAP_PER_PAGE]; - }; - - std::vector pages; ///< Pointers to arena-allocated pages. - size_t num_values = 0; ///< Total row ID count across all pages. - uint8_t table_id = 0; ///< Which base table this column tracks. - - rowid_column_t() = default; - - rowid_column_t(rowid_column_t &&other) noexcept - : pages(std::move(other.pages)), num_values(other.num_values), - table_id(other.table_id) { - other.pages.clear(); - other.num_values = 0; - } - - rowid_column_t &operator=(rowid_column_t &&other) noexcept { - if (this != &other) { - pages = std::move(other.pages); - num_values = other.num_values; - table_id = other.table_id; - other.pages.clear(); - other.num_values = 0; - } - return *this; - } - - rowid_column_t(const rowid_column_t &) = delete; - rowid_column_t &operator=(const rowid_column_t &) = delete; - - ~rowid_column_t() = default; - - /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. */ - inline uint32_t operator[](size_t idx) const { - return pages[idx >> 12]->data[idx & 0xFFF]; - } - - /** @brief Thread-safe write at idx (requires pages to be set up first). */ - inline void write_at(size_t idx, uint32_t val) { - pages[idx >> 12]->data[idx & 0xFFF] = val; - } - - /** @brief Total row ID count. 
*/ - size_t row_count() const { return num_values; } - - /** @brief Set row count without allocation (for assembly pattern). */ - inline void set_row_count(size_t count) { num_values = count; } - - /** @brief Pre-allocate pages from arena. */ - inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, - size_t count) { - static_assert(sizeof(Page) == - Contest::platform::ChunkSize< - Contest::platform::ChunkType::IR_PAGE>::value, - "Page size mismatch with IR_PAGE chunk size"); - size_t pages_needed = (count + CAP_PER_PAGE - 1) / CAP_PER_PAGE; - pages.reserve(pages_needed); - for (size_t i = 0; i < pages_needed; ++i) { - void *ptr = - arena.alloc_chunk(); - pages.push_back(reinterpret_cast(ptr)); - } - num_values = count; - } -}; - /** * @brief 64-bit provenance column for deferred materialization. * @@ -237,7 +182,7 @@ struct rowid_column_t { * DeferredProvenance encoding. Uses 32KB pages with 4096 entries each. * * @see DeferredProvenance for encoding scheme. - * @see deferred_intermediate.h for DeferredResult usage. + * @see IntermediateResult for usage. */ struct deferred_column_t { static constexpr size_t PAGE_SIZE = 1 << 15; // 32KB @@ -312,67 +257,156 @@ struct deferred_column_t { } }; +} // namespace mema + +namespace Contest { + /** - * @brief Convert column_t vector to ColumnarTable. Dereferences VARCHAR refs. - * @see materialize.h + * @brief Lightweight intermediate result with selective materialization. + * + * Stores only columns marked MATERIALIZE (typically just the parent's join + * key). All other columns are resolved at final materialization using + * per-column 64-bit provenance (table_id, column_idx, row_id). + * + * Memory savings: For a join projecting N columns where only 1 is a join key, + * IntermediateResult uses ~1/N the memory for data columns. Additionally, we + * only track provenance for deferred columns (not all tables). + * + * @see AnalyzedColumnInfo for materialization decisions. 
+ * @see DeferredProvenance for 64-bit encoding scheme. */ -ColumnarTable to_columnar(const Columnar &table, const Plan &plan); -} /* namespace mema */ +struct IntermediateResult { + /// Only columns marked MATERIALIZE (typically 1 join key). + std::vector materialized; -/** @namespace Contest @brief Contest API. @see Plan, execute.cpp */ -namespace Contest { -/** @brief Result type for non-root joins (intermediate format). */ -using ExecuteResult = std::vector; + /// Map: original column index -> index in materialized (nullopt if + /// deferred). + std::vector> materialized_map; + + /// Per-deferred-column provenance (64-bit encoded table_id+column_idx+row). + /// One deferred_column_t per DEFER column, stores full provenance per row. + std::vector deferred_columns; + + /// Map: original column index -> index in deferred_columns (nullopt if + /// materialized). + std::vector> deferred_map; + + /// Reference to node info for column provenance resolution. + const AnalyzedJoinNode *node_info = nullptr; + + /// Total row count. + size_t num_rows = 0; + + IntermediateResult() = default; + IntermediateResult(IntermediateResult &&) = default; + IntermediateResult &operator=(IntermediateResult &&) = default; + IntermediateResult(const IntermediateResult &) = delete; + IntermediateResult &operator=(const IntermediateResult &) = delete; + + /** @brief Total row count. */ + size_t row_count() const { return num_rows; } + + /** @brief Check if column was materialized (not deferred). */ + bool is_materialized(size_t orig_idx) const { + return orig_idx < materialized_map.size() && + materialized_map[orig_idx].has_value(); + } + + /** @brief Check if column is deferred. */ + bool is_deferred(size_t orig_idx) const { + return orig_idx < deferred_map.size() && + deferred_map[orig_idx].has_value(); + } + + /** @brief Get materialized column, or nullptr if deferred. 
*/ + const mema::column_t *get_materialized(size_t orig_idx) const { + if (!is_materialized(orig_idx)) + return nullptr; + return &materialized[*materialized_map[orig_idx]]; + } + + /** @brief Get deferred column provenance, or nullptr if materialized. */ + const mema::deferred_column_t *get_deferred(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_columns[*deferred_map[orig_idx]]; + } + + /** @brief Get mutable deferred column provenance, or nullptr. */ + mema::deferred_column_t *get_deferred_mut(size_t orig_idx) { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_columns[*deferred_map[orig_idx]]; + } + + /** @brief Number of deferred columns. */ + size_t num_deferred() const { return deferred_columns.size(); } +}; /** - * @brief Extended intermediate result with row ID tracking. + * @brief Unified abstraction over columnar tables and intermediate results. * - * Wraps ExecuteResult with parallel row ID columns that track - * which original scan rows contributed to each intermediate row. - * One rowid_column_t per base table participating in the join tree. + * Stores ColumnarTable* (base scans) or IntermediateResult (child joins). + * Provides uniform interface for columnar (base table) and intermediate + * data sources. * - * @see GlobalRowId for encoding, construct_intermediate.h for population. + * @see IntermediateResult for intermediate join results. + * @see ColumnarTable for base table storage. */ -struct ExtendedResult { - ExecuteResult columns; ///< Data columns (value_t). - std::vector row_ids; ///< One per participating table. - std::vector table_ids; ///< Which tables are tracked (sorted). +struct JoinInput { + /// Either base table pointer or owned IntermediateResult. + std::variant data; - ExtendedResult() = default; + /// Original plan node for output_attrs mapping. 
+ const PlanNode *node = nullptr; - ExtendedResult(ExtendedResult &&) = default; - ExtendedResult &operator=(ExtendedResult &&) = default; + /// Analyzed plan node for materialization decisions. + const AnalyzedNode *analyzed_node = nullptr; - ExtendedResult(const ExtendedResult &) = delete; - ExtendedResult &operator=(const ExtendedResult &) = delete; + /// Base table ID (for columnar inputs). + uint8_t table_id = 0; - /** @brief Total row count (from first data column). */ - size_t row_count() const { - return columns.empty() ? 0 : columns[0].row_count(); + /** @brief True if data is columnar (base table). */ + bool is_columnar() const { + return std::holds_alternative(data); } - /** @brief Find row ID column index for a specific table, or -1 if not - * found. */ - int find_rowid_index(uint8_t tid) const { - for (size_t i = 0; i < table_ids.size(); ++i) { - if (table_ids[i] == tid) - return static_cast(i); + /** @brief Row count for join key column. */ + size_t row_count(size_t col_idx) const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; } - return -1; + return std::get(data).row_count(); } - /** @brief Get row ID column for a specific table, or nullptr if not found. - */ - const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { - int idx = find_rowid_index(tid); - return (idx >= 0) ? &row_ids[idx] : nullptr; + /** @brief Total row count. */ + size_t row_count() const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; + } + return std::get(data).row_count(); } - /** @brief Get mutable row ID column for a specific table, or nullptr. */ - mema::rowid_column_t *get_rowid_column_mut(uint8_t tid) { - int idx = find_rowid_index(tid); - return (idx >= 0) ? &row_ids[idx] : nullptr; + /** @brief Number of output columns. 
*/ + size_t output_size() const { + if (node) + return node->output_attrs.size(); + return 0; + } + + /** + * @brief Get deferred column provenance for a column index. + * + * For columnar inputs, returns nullptr (caller must encode fresh). + * For IntermediateResult inputs, returns existing provenance column. + */ + const mema::deferred_column_t *get_deferred_column(size_t col_idx) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_deferred(col_idx); } }; -} /* namespace Contest */ +} // namespace Contest diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index b2f1f00..0e75ccf 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -1,12 +1,3 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - /** * @file hash_join.h * @brief Hash join build and probe operations. @@ -19,6 +10,13 @@ * * @see hashtable.h, match_collector.h */ +#pragma once + +#include +#include +#include +#include +#include /** * @namespace Contest::join @@ -27,8 +25,6 @@ */ namespace Contest::join { -using Contest::ExecuteResult; -using Contest::ExtendedResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -53,16 +49,22 @@ inline UnchainedHashtable build_from_columnar(const JoinInput &input, /** * @brief Build hash table from intermediate results (column_t). * - * Uses join key column from ExecuteResult produced by prior pipeline stages. + * Uses join key column from IntermediateResult produced by prior pipeline + * stages. 
*/ inline UnchainedHashtable build_from_intermediate(const JoinInput &input, size_t attr_idx) { - const auto &result = std::get(input.data); - const auto &column = result.columns[attr_idx]; + const auto &result = std::get(input.data); + // Get the materialized column for the join key + const auto *column = result.get_materialized(attr_idx); + if (!column) { + // This should never happen - join keys must be materialized + std::abort(); + } size_t row_count = input.row_count(attr_idx); UnchainedHashtable hash_table(row_count); - hash_table.build_intermediate(column, 8); + hash_table.build_intermediate(*column, 8); return hash_table; } diff --git a/include/join_execution/join_setup.h b/include/join_execution/join_setup.h index 188873d..217995e 100644 --- a/include/join_execution/join_setup.h +++ b/include/join_execution/join_setup.h @@ -1,87 +1,24 @@ /** * @file join_setup.h - * @brief Join configuration and input abstraction. + * @brief Join configuration and build/probe side selection. * - * Provides JoinInput to abstract over columnar and intermediate data sources, - * and utilities for selecting build/probe sides and preparing output columns. + * Provides utilities for selecting build/probe sides and determining + * which row IDs to collect based on output columns. */ #pragma once -#include #include #include #include #include -#include #include /** * @namespace Contest::join - * @brief JoinInput abstraction, build/probe selection, output column setup. + * @brief Build/probe selection and collection mode determination. */ namespace Contest::join { -using Contest::ExecuteResult; -using Contest::ExtendedResult; -using Contest::io::ColumnarReader; - -/** - * @brief Unified abstraction over columnar tables and intermediate results. - * - * Stores ColumnarTable* (base scans) or ExtendedResult (child joins). Node - * provides output_attrs mapping for column resolution. 
- */ -struct JoinInput { - std::variant data; - const PlanNode *node; /**< Provides output_attrs for column mapping. */ - uint8_t table_id; /**< Source table ID for provenance tracking. */ - - /** @brief True if data is columnar (base table), false if intermediate. */ - bool is_columnar() const { - return std::holds_alternative(data); - } - - /** - * @brief Row count for a given output column. - * @param col_idx Index into node->output_attrs. - */ - size_t row_count(size_t col_idx) const { - if (is_columnar()) { - auto *table = std::get(data); - auto [actual_col_idx, _] = node->output_attrs[col_idx]; - return table->num_rows; - } else { - return std::get(data).columns[col_idx].row_count(); - } - } - - /** @brief Number of output columns. */ - size_t output_size() const { return node->output_attrs.size(); } - - /** - * @brief Get list of tables whose row IDs are tracked in this input. - * - * For columnar input: returns {table_id}. - * For intermediate: returns the tracked table_ids from ExtendedResult. - */ - std::vector tracked_tables() const { - if (is_columnar()) { - return {table_id}; - } - return std::get(data).table_ids; - } - - /** - * @brief Get row ID column for a specific table. - * @return nullptr for columnar inputs (row IDs encoded on-the-fly). - */ - const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { - if (is_columnar()) - return nullptr; - return std::get(data).get_rowid_column(tid); - } -}; - /** * @brief Configuration for build/probe side assignment. * @@ -99,17 +36,6 @@ struct BuildProbeConfig { size_t probe_attr; /**< Join key index in probe's output_attrs. */ }; -/** @brief Resolves global output column index to source input. 
*/ -inline std::tuple -resolve_input_source(size_t global_idx, size_t split_point, - const JoinInput &input_a, const PlanNode &node_a, - const JoinInput &input_b, const PlanNode &node_b) { - if (global_idx < split_point) { - return {input_a, node_a, global_idx}; - } - return {input_b, node_b, global_idx - split_point}; -} - /** * @brief Chooses build/probe sides based on cardinality. * @@ -180,190 +106,4 @@ inline MatchCollectionMode determine_collection_mode( return MatchCollectionMode::BOTH; } -/** - * @brief Creates output columns with provenance metadata from inputs. - */ -inline ExtendedResult initialize_output_columns( - const std::vector> &output_attrs, - const PlanNode &left_node, const PlanNode &right_node, - const JoinInput &left_input, const JoinInput &right_input, - size_t estimated_rows) { - ExtendedResult results; - results.columns.reserve(output_attrs.size()); - size_t left_size = left_input.output_size(); - - auto set_column_metadata = [](mema::column_t &col, const JoinInput &input, - const PlanNode &node, size_t col_idx) { - auto [actual_col_idx, _] = node.output_attrs[col_idx]; - if (input.is_columnar()) { - col.source_table = input.table_id; - col.source_column = actual_col_idx; - } else { - const auto &result = std::get(input.data); - col.source_table = result.columns[col_idx].source_table; - col.source_column = result.columns[col_idx].source_column; - } - }; - - for (size_t i = 0; i < output_attrs.size(); ++i) { - auto [col_idx, _] = output_attrs[i]; - auto [input, node, local_idx] = resolve_input_source( - col_idx, left_size, left_input, left_node, right_input, right_node); - - mema::column_t col; - set_column_metadata(col, input, node, local_idx); - results.columns.push_back(std::move(col)); - } - - return results; -} - -/** - * @brief Join output state and columnar reader. - * - * prepared flag implements lazy PageIndex construction. - */ -struct JoinSetup { - ExtendedResult results; /**< Output columns + row ID columns. 
*/ - ColumnarReader - columnar_reader; /**< Page cursor caching for columnar access. */ - std::vector merged_table_ids; /**< Tables tracked in output. */ - /** - * True after prepare_output_columns called. - */ - bool prepared; - - JoinSetup() : prepared(false) {} -}; - -/** - * @brief Merge tracked table IDs from build and probe (sorted, unique). - * - * Both input vectors must be sorted. Output is sorted and deduplicated. - */ -inline std::vector -merge_tracked_tables(const std::vector &build_tables, - const std::vector &probe_tables) { - std::vector merged; - merged.reserve(build_tables.size() + probe_tables.size()); - - size_t i = 0, j = 0; - while (i < build_tables.size() && j < probe_tables.size()) { - if (build_tables[i] < probe_tables[j]) { - merged.push_back(build_tables[i++]); - } else if (probe_tables[j] < build_tables[i]) { - merged.push_back(probe_tables[j++]); - } else { - merged.push_back(build_tables[i++]); - j++; // Skip duplicate - } - } - while (i < build_tables.size()) - merged.push_back(build_tables[i++]); - while (j < probe_tables.size()) - merged.push_back(probe_tables[j++]); - - return merged; -} - -/** - * @brief Initializes JoinSetup with output columns; call before join execution. - * - * PageIndex construction deferred to prepare_output_columns(). - * Computes merged table IDs from build and probe inputs. 
- */ -inline JoinSetup -setup_join(const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - const PlanNode &left_node, const PlanNode &right_node, - const JoinInput &left_input, const JoinInput &right_input, - const std::vector> &output_attrs, - size_t estimated_rows) { - JoinSetup setup; - - setup.results = - initialize_output_columns(output_attrs, left_node, right_node, - left_input, right_input, estimated_rows); - - // Compute merged table IDs from build and probe sides - auto build_tables = build_input.tracked_tables(); - auto probe_tables = probe_input.tracked_tables(); - setup.merged_table_ids = merge_tracked_tables(build_tables, probe_tables); - - setup.prepared = false; - - return setup; -} - -/** - * @brief Collects Column pointers for needed output columns from columnar - * input. - * - * Unused columns get nullptr to skip PageIndex construction. - */ -inline platform::ArenaVector -collect_needed_columns(const JoinInput &input, const PlanNode &node, - const platform::ArenaVector &needed, - platform::ThreadArena &arena) { - platform::ArenaVector columns(arena); - columns.resize(node.output_attrs.size()); - std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); - auto *table = std::get(input.data); - - for (size_t i = 0; i < node.output_attrs.size(); ++i) { - auto [actual_col_idx, _] = node.output_attrs[i]; - columns[i] = needed[i] ? &table->columns[actual_col_idx] : nullptr; - } - return columns; -} - -/** - * @brief Prepares ColumnarReader with columns needed for materialization. - * - * Triggers lazy PageIndex construction only for projected columns. 
- */ -inline void prepare_output_columns( - ColumnarReader &reader, const JoinInput &build_input, - const JoinInput &probe_input, const PlanNode &build_node, - const PlanNode &probe_node, - const std::vector> &remapped_attrs, - size_t build_size) { - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - if (!build_is_columnar && !probe_is_columnar) - return; - - auto &arena = Contest::platform::get_arena(0); - - platform::ArenaVector build_needed(arena); - build_needed.resize(build_node.output_attrs.size()); - std::memset(build_needed.data(), 0, build_needed.size()); - - platform::ArenaVector probe_needed(arena); - probe_needed.resize(probe_node.output_attrs.size()); - std::memset(probe_needed.data(), 0, probe_needed.size()); - - for (const auto &[col_idx, dtype] : remapped_attrs) { - if (col_idx < build_size) { - if (build_is_columnar) { - build_needed[col_idx] = 1; - } - } else if (probe_is_columnar) { - probe_needed[col_idx - build_size] = 1; - } - } - - if (build_is_columnar) { - reader.prepare_build(collect_needed_columns(build_input, build_node, - build_needed, arena)); - } - - if (probe_is_columnar) { - reader.prepare_probe(collect_needed_columns(probe_input, probe_node, - probe_needed, arena)); - } -} - } // namespace Contest::join diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index 8546854..c99a8be 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -13,10 +13,8 @@ #include #include -#include #include #include -#include #include #include #include @@ -29,9 +27,6 @@ */ namespace Contest::join { -using Contest::ExtendedResult; - -using Contest::ExecuteResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -39,6 +34,8 @@ using Contest::platform::worker_pool; * @brief Iterates over non-NULL values in a join input column. * * Abstracts columnar vs intermediate input. 
Handles NULL bitmaps. + * For IntermediateResult, reads from materialized columns (join keys are + * always materialized). * * @tparam Func void(uint32_t row_id, int32_t value). */ @@ -72,57 +69,7 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } } else { - const auto &res = std::get(input.data); - const mema::column_t &col = res.columns[attr_idx]; - size_t count = col.row_count(); - for (size_t i = 0; i < count; i++) { - const mema::value_t &val = col[i]; - if (!val.is_null()) { - visitor(static_cast(i), val.value); - } - } - } -} - -/** - * @brief Iterates over non-NULL values in a deferred input column. - * - * Abstracts columnar vs DeferredResult input. For DeferredResult, reads from - * materialized columns (join keys are always materialized). - * - * @tparam Func void(uint32_t row_id, int32_t value). - */ -template -inline void visit_deferred_rows(const DeferredInput &input, size_t attr_idx, - Func &&visitor) { - if (input.is_columnar()) { - auto *table = std::get(input.data); - auto [col_idx, _] = input.node->output_attrs[attr_idx]; - const Column &col = table->columns[col_idx]; - - uint32_t row_id = 0; - for (auto *page_obj : col.pages) { - auto *page = page_obj->data; - auto num_rows = *reinterpret_cast(page); - auto num_values = *reinterpret_cast(page + 2); - auto *data = reinterpret_cast(page + 4); - - uint16_t val_idx = 0; - for (uint16_t i = 0; i < num_rows; i++) { - if (num_rows == num_values) { - visitor(row_id++, data[i]); - } else { - auto *bitmap = reinterpret_cast( - page + PAGE_SIZE - (num_rows + 7) / 8); - if (bitmap[i / 8] & (1u << (i % 8))) { - visitor(row_id, data[val_idx++]); - } - row_id++; - } - } - } - } else { - const auto &res = std::get(input.data); + const auto &res = std::get(input.data); // Join key must be materialized const mema::column_t *col = res.get_materialized(attr_idx); if (!col) @@ -180,161 +127,6 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, b_vals[i] = INT32_MIN; } 
- const Column *probe_col = nullptr; - platform::ArenaVector page_offsets( - Contest::platform::get_arena(0)); - if (probe_input.is_columnar()) { - auto *table = std::get(probe_input.data); - auto [col_idx, _] = probe_input.node->output_attrs[probe_attr]; - probe_col = &table->columns[col_idx]; - - page_offsets.reserve(probe_col->pages.size() + 1); - uint32_t current = 0; - for (auto *p : probe_col->pages) { - page_offsets.push_back(current); - current += *reinterpret_cast(p->data); - } - page_offsets.push_back(current); - } - std::atomic probe_page_counter{0}; - - worker_pool().execute([&](size_t t_id) { - buffers[t_id] = - ThreadLocalMatchBuffer(Contest::platform::get_arena(t_id)); - auto &local_buffer = buffers[t_id]; - - auto process_value = [&](uint32_t p_id, int32_t p_val) { - simd::eq_scan_build(p_id, p_val, b_vals, b_ids, b_count, - local_buffer); - }; - - if (probe_input.is_columnar()) { - size_t num_pages = probe_col->pages.size(); - - while (true) { - size_t i = - probe_page_counter.fetch_add(1, std::memory_order_relaxed); - - if (i >= num_pages) - break; - auto *page = probe_col->pages[i]->data; - auto num_rows = *reinterpret_cast(page); - auto num_values = *reinterpret_cast(page + 2); - auto *data = reinterpret_cast(page + 4); - uint32_t row_id = page_offsets[i]; - - if (num_rows == num_values) { - // SIMD batch: process multiple probe values at a time - uint16_t j = simd::eq_batch_columnar( - data, num_rows, row_id, b_vals, b_ids, b_count, - local_buffer); - row_id += j; - // Handle remaining elements with scalar - for (; j < num_rows; j++) { - process_value(row_id++, data[j]); - } - } else { - auto *bitmap = reinterpret_cast( - page + PAGE_SIZE - (num_rows + 7) / 8); - uint16_t val_idx = 0; - for (uint16_t j = 0; j < num_rows; j++) { - if (bitmap[j / 8] & (1u << (j % 8))) { - process_value(row_id, data[val_idx++]); - } - row_id++; - } - } - } - } else { - const auto &res = std::get(probe_input.data); - const mema::column_t &col = 
res.columns[probe_attr]; - size_t count = col.row_count(); - size_t start = (t_id * count) / THREAD_COUNT; - size_t end = ((t_id + 1) * count) / THREAD_COUNT; - - constexpr size_t BATCH_SIZE = simd::INTERMEDIATE_BATCH_SIZE; - size_t i = start; - - if constexpr (BATCH_SIZE > 0) { - // SIMD batch processing - for (; i + BATCH_SIZE <= end; i += BATCH_SIZE) { - size_t page_idx = i >> 12; - size_t offset = i & 0xFFF; - - // Only use SIMD if all values are on same page - if (offset + BATCH_SIZE <= mema::CAP_PER_PAGE) { - const int32_t *vals = reinterpret_cast( - &col.pages[page_idx]->data[offset]); - simd::eq_batch_intermediate( - vals, i, b_vals, b_ids, b_count, local_buffer); - } else { - // Cross-page boundary: fall back to scalar - for (size_t j = i; j < i + BATCH_SIZE; j++) { - const mema::value_t &val = col[j]; - if (!val.is_null()) { - process_value(static_cast(j), - val.value); - } - } - } - } - } - - // Handle remaining elements (or all elements if no SIMD) - for (; i < end; i++) { - const mema::value_t &val = col[i]; - if (!val.is_null()) { - process_value(static_cast(i), val.value); - } - } - } - }); - - return buffers; -} - -/** - * @brief Nested loop join for deferred execution path. - * - * Same algorithm as nested_loop_join but works with DeferredInput. - * Supports both columnar and DeferredResult inputs. - * - * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY). - * @return Thread-local match buffers for direct iteration. 
- */ -template -inline std::vector> -nested_loop_join_deferred(const DeferredInput &build_input, - const DeferredInput &probe_input, size_t build_attr, - size_t probe_attr) { - size_t build_rows = build_input.row_count(build_attr); - size_t probe_rows = probe_input.row_count(probe_attr); - - if (build_rows == 0 || probe_rows == 0) - return {}; - - size_t num_threads = THREAD_COUNT; - std::vector> buffers(num_threads); - - constexpr size_t MAX_BUILD_SIZE = 8; - alignas(32) int32_t b_vals[MAX_BUILD_SIZE]; - alignas(16) uint32_t b_ids[MAX_BUILD_SIZE]; - size_t b_count = 0; - - auto collect_build = [&](uint32_t id, int32_t val) { - if (b_count < MAX_BUILD_SIZE) { - b_ids[b_count] = id; - b_vals[b_count] = val; - b_count++; - } - }; - - visit_deferred_rows(build_input, build_attr, collect_build); - - for (size_t i = b_count; i < MAX_BUILD_SIZE; ++i) { - b_vals[i] = INT32_MIN; - } - // Setup for columnar probe (page-based parallel processing) const Column *probe_col = nullptr; platform::ArenaVector page_offsets( @@ -353,10 +145,10 @@ nested_loop_join_deferred(const DeferredInput &build_input, page_offsets.push_back(current); } - // Setup for DeferredResult probe + // Setup for IntermediateResult probe const mema::column_t *probe_mat_col = nullptr; if (!probe_input.is_columnar()) { - const auto &res = std::get(probe_input.data); + const auto &res = std::get(probe_input.data); probe_mat_col = res.get_materialized(probe_attr); if (!probe_mat_col) return {}; // Join key not materialized - should not happen @@ -412,7 +204,7 @@ nested_loop_join_deferred(const DeferredInput &build_input, } } } else { - // DeferredResult probe - use materialized column + // IntermediateResult probe - use materialized column const mema::column_t &col = *probe_mat_col; size_t count = col.row_count(); size_t start = (t_id * count) / THREAD_COUNT; diff --git a/include/materialization/construct_deferred.h b/include/materialization/construct_deferred.h deleted file mode 100644 index bb9a425..0000000 
--- a/include/materialization/construct_deferred.h +++ /dev/null @@ -1,649 +0,0 @@ -/** - * @file construct_deferred.h - * @brief Constructs deferred intermediate results for multi-way joins. - * - * Allocates and populates DeferredResult with only MATERIALIZE columns - * (typically just the parent's join key). Deferred columns store 64-bit - * provenance (table_id, column_idx, row_id) for resolution at final output. - * - * Optimized with: - * - Column-major iteration for cache locality - * - Precomputed source metadata to avoid per-row variant access - * - SIMD provenance encoding (AVX2/NEON) for deferred columns - * - Batch access to match collector chunks - * - * @see construct_intermediate.h for the eager materialization equivalent. - * @see materialize_deferred.h for final resolution of deferred columns. - */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(__x86_64__) -#include -#elif defined(__aarch64__) -#include -#endif - -namespace Contest { -namespace materialize { - -using Contest::io::ColumnarReader; -using Contest::join::MatchCollectionMode; -using Contest::join::ThreadLocalMatchBuffer; -using Contest::platform::THREAD_COUNT; -using Contest::platform::worker_pool; - -// ============================================================================ -// SIMD Provenance Encoding -// ============================================================================ - -namespace simd_provenance { - -#if defined(__x86_64__) && defined(__AVX2__) -inline constexpr size_t BATCH_SIZE = 4; ///< 4 x uint64_t in AVX2 (256-bit) -#elif defined(__aarch64__) -inline constexpr size_t BATCH_SIZE = 2; ///< 2 x uint64_t in NEON (128-bit) -#else -inline constexpr size_t BATCH_SIZE = 0; ///< No SIMD available -#endif - -/** - * @brief Encode provenance for batch of row IDs using SIMD. - * - * Encodes (table_id << 56) | (column_idx << 48) | row_id for each row. 
- * Uses AVX2 on x86_64 or NEON on aarch64, with scalar fallback. - * - * @param dest Destination deferred column - * @param start_idx Starting output index - * @param row_ids Pointer to row IDs (from IndexChunk, contiguous) - * @param count Number of row IDs to process - * @param table_id Base table ID (constant for all rows) - * @param column_idx Base column index (constant for all rows) - * @return Number of rows processed (always == count) - */ -inline size_t encode_provenance_batch(mema::deferred_column_t &dest, - size_t start_idx, const uint32_t *row_ids, - size_t count, uint8_t table_id, - uint8_t column_idx) { - // Precompute constant prefix: (table_id << 56) | (column_idx << 48) - const uint64_t prefix = DeferredProvenance::encode(table_id, column_idx, 0); - - size_t i = 0; - -#if defined(__x86_64__) && defined(__AVX2__) - // AVX2: Process 4 x uint64_t at a time - // Load 4 x uint32_t, zero-extend to 4 x uint64_t, OR with prefix - const __m256i prefix_vec = _mm256_set1_epi64x(static_cast(prefix)); - - for (; i + 4 <= count; i += 4) { - // Load 4 x uint32_t and zero-extend to 4 x uint64_t - __m128i rows_32 = - _mm_loadu_si128(reinterpret_cast(row_ids + i)); - __m256i rows_64 = _mm256_cvtepu32_epi64(rows_32); - - // OR with prefix to create provenance values - __m256i result = _mm256_or_si256(rows_64, prefix_vec); - - // Store to aligned buffer, then write individually (page-safe) - alignas(32) uint64_t out[4]; - _mm256_store_si256(reinterpret_cast<__m256i *>(out), result); - - dest.write_at(start_idx + i, out[0]); - dest.write_at(start_idx + i + 1, out[1]); - dest.write_at(start_idx + i + 2, out[2]); - dest.write_at(start_idx + i + 3, out[3]); - } -#elif defined(__aarch64__) - // NEON: Process 2 x uint64_t at a time - const uint64x2_t prefix_vec = vdupq_n_u64(prefix); - - for (; i + 2 <= count; i += 2) { - // Load 2 x uint32_t and zero-extend to 2 x uint64_t - uint32x2_t rows_32 = vld1_u32(row_ids + i); - uint64x2_t rows_64 = vmovl_u32(rows_32); - - // OR 
with prefix - uint64x2_t result = vorrq_u64(rows_64, prefix_vec); - - // Store individually (page boundary safe) - dest.write_at(start_idx + i, vgetq_lane_u64(result, 0)); - dest.write_at(start_idx + i + 1, vgetq_lane_u64(result, 1)); - } -#endif - - // Scalar remainder - for (; i < count; ++i) { - dest.write_at(start_idx + i, - prefix | static_cast(row_ids[i])); - } - - return count; -} - -/** - * @brief Copy provenance from source column using batch reads. - * - * Copies existing 64-bit provenance values from child intermediate. - * Uses contiguous batch access for better cache behavior. - * - * @param dest Destination deferred column - * @param start_idx Starting output index - * @param src Source deferred column (from child) - * @param row_ids Row indices into source column - * @param count Number of rows to copy - * @return Number of rows processed (always == count) - */ -inline size_t copy_provenance_batch(mema::deferred_column_t &dest, - size_t start_idx, - const mema::deferred_column_t &src, - const uint32_t *row_ids, size_t count) { - for (size_t i = 0; i < count; ++i) { - dest.write_at(start_idx + i, src[row_ids[i]]); - } - return count; -} - -} // namespace simd_provenance - -// ============================================================================ -// Source Precomputation Structures -// ============================================================================ - -/** - * @brief Precomputed metadata for deferred column sources. - * - * Tracks where each deferred column's provenance comes from: - * - For columnar inputs: encode fresh (table_id, column_idx, row_id) - * - For DeferredResult inputs: copy existing provenance from child - */ -struct DeferredColumnSource { - const mema::deferred_column_t *source_col = - nullptr; ///< Source if from intermediate. - uint8_t base_table_id = 0; ///< Base table ID for encoding. - uint8_t base_column_idx = 0; ///< Base column index for encoding. - bool from_build = false; ///< True if from build side. 
- bool needs_encode = false; ///< True if columnar (needs fresh encode). -}; - -/** - * @brief Precomputed metadata for materialized column sources. - * - * Eliminates per-row std::variant access and conditional checks in hot loop. - * Mirrors SourceInfo from construct_intermediate.h but for deferred path. - */ -struct alignas(8) MaterializedColumnSource { - const mema::column_t *intermediate_col = - nullptr; ///< Source if from DeferredResult materialized - const Column *columnar_col = nullptr; ///< Source if from ColumnarTable - const mema::deferred_column_t *deferred_resolve_col = - nullptr; ///< Source if needs deferred resolution - size_t child_output_idx = 0; ///< Index in child's output - size_t mat_col_idx = 0; ///< Index in result.materialized[] - DataType type = DataType::INT32; - uint8_t base_table_id = 0; ///< For VARCHAR source tracking - uint8_t base_column_idx = 0; ///< For VARCHAR source tracking - bool is_columnar = false; ///< True if source is ColumnarTable - bool from_build = false; ///< True if from build side - bool needs_deferred_resolve = false; ///< True if child deferred this column -}; - -// ============================================================================ -// Helper Functions -// ============================================================================ - -/** - * @brief Collect columns needed from a DeferredInput for page index building. 
- */ -inline platform::ArenaVector -collect_deferred_columns(const DeferredInput &input, - const platform::ArenaVector &needed, - platform::ThreadArena &arena) { - platform::ArenaVector columns(arena); - if (!input.node) - return columns; - - columns.resize(input.node->output_attrs.size()); - std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); - - if (!input.is_columnar()) - return columns; - - auto *table = std::get(input.data); - for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { - if (i < needed.size() && needed[i]) { - auto [actual_col_idx, _] = input.node->output_attrs[i]; - columns[i] = &table->columns[actual_col_idx]; - } - } - return columns; -} - -/** - * @brief Prepare ColumnarReader for deferred materialization path. - * - * Sets up page indices for columns that need to be read from columnar inputs. - */ -inline void prepare_deferred_columns( - ColumnarReader &reader, const DeferredInput &build_input, - const DeferredInput &probe_input, const DeferredJoinNode &join_node, - const std::vector> &remapped_attrs, - size_t build_size, bool build_is_left) { - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - if (!build_is_columnar && !probe_is_columnar) - return; - - auto &arena = Contest::platform::get_arena(0); - - // Determine which columns from each side are needed - platform::ArenaVector build_needed(arena); - if (build_input.node) { - build_needed.resize(build_input.node->output_attrs.size()); - std::memset(build_needed.data(), 0, build_needed.size()); - } - - platform::ArenaVector probe_needed(arena); - if (probe_input.node) { - probe_needed.resize(probe_input.node->output_attrs.size()); - std::memset(probe_needed.data(), 0, probe_needed.size()); - } - - // Mark columns needed based on materialization decisions - for (const auto &col : join_node.columns) { - if (col.resolution == ColumnResolution::MATERIALIZE) { - bool from_build = (col.from_left == 
build_is_left); - if (from_build && col.child_output_idx < build_needed.size()) { - build_needed[col.child_output_idx] = 1; - } else if (!from_build && - col.child_output_idx < probe_needed.size()) { - probe_needed[col.child_output_idx] = 1; - } - } - } - - if (build_is_columnar) { - reader.prepare_build( - collect_deferred_columns(build_input, build_needed, arena)); - } - - if (probe_is_columnar) { - reader.prepare_probe( - collect_deferred_columns(probe_input, probe_needed, arena)); - } -} - -/** - * @brief Create empty deferred result with proper schema. - */ -inline DeferredResult -create_empty_deferred_result(const DeferredJoinNode &node) { - DeferredResult result; - result.node_info = &node; - result.num_rows = 0; - result.materialized_map.resize(node.columns.size(), std::nullopt); - result.deferred_map.resize(node.columns.size(), std::nullopt); - - size_t mat_count = 0; - size_t def_count = 0; - for (const auto &col : node.columns) { - if (col.resolution == ColumnResolution::MATERIALIZE) { - result.materialized_map[col.original_idx] = mat_count++; - } else { - result.deferred_map[col.original_idx] = def_count++; - } - } - result.materialized.resize(mat_count); - result.deferred_columns.resize(def_count); - - return result; -} - -/** - * @brief Prepare deferred column sources for intermediate construction. - */ -inline std::vector -prepare_deferred_sources(const DeferredJoinNode &join_node, - const DeferredInput &build_input, - const DeferredInput &probe_input, bool build_is_left) { - std::vector sources; - sources.reserve(join_node.num_deferred_columns); - - for (const auto &col : join_node.columns) { - if (col.resolution != ColumnResolution::DEFER) - continue; - - DeferredColumnSource src; - src.base_table_id = col.provenance.base_table_id; - src.base_column_idx = col.provenance.base_column_idx; - src.from_build = (col.from_left == build_is_left); - - const auto &src_input = src.from_build ? 
build_input : probe_input; - - if (src_input.is_columnar()) { - src.needs_encode = true; - src.source_col = nullptr; - } else { - const auto *child_def = - src_input.get_deferred_column(col.child_output_idx); - if (child_def) { - src.needs_encode = false; - src.source_col = child_def; - } else { - src.needs_encode = true; - src.source_col = nullptr; - } - } - sources.push_back(src); - } - return sources; -} - -/** - * @brief Precompute materialized column sources for column-major iteration. - * - * For each MATERIALIZE column, determines source type and caches pointers - * to avoid per-row std::variant access in the hot loop. - */ -inline std::vector prepare_materialized_sources( - const DeferredJoinNode &join_node, const DeferredInput &build_input, - const DeferredInput &probe_input, bool build_is_left) { - std::vector sources; - sources.reserve(join_node.columns.size()); - - size_t mat_idx = 0; - for (const auto &col : join_node.columns) { - if (col.resolution != ColumnResolution::MATERIALIZE) - continue; - - MaterializedColumnSource src; - src.mat_col_idx = mat_idx++; - src.child_output_idx = col.child_output_idx; - src.type = col.type; - src.base_table_id = col.provenance.base_table_id; - src.base_column_idx = col.provenance.base_column_idx; - src.from_build = (col.from_left == build_is_left); - - const auto &src_input = src.from_build ? 
build_input : probe_input; - - if (src_input.is_columnar()) { - src.is_columnar = true; - const auto *table = std::get(src_input.data); - auto [actual_idx, _] = - src_input.node->output_attrs[col.child_output_idx]; - src.columnar_col = &table->columns[actual_idx]; - } else { - src.is_columnar = false; - const auto &ir = std::get(src_input.data); - - if (ir.is_materialized(col.child_output_idx)) { - src.intermediate_col = - ir.get_materialized(col.child_output_idx); - } else if (ir.is_deferred(col.child_output_idx)) { - src.needs_deferred_resolve = true; - src.deferred_resolve_col = - ir.get_deferred(col.child_output_idx); - } - } - sources.push_back(src); - } - - return sources; -} - -// ============================================================================ -// Main Construction Function -// ============================================================================ - -/** - * @brief Constructs deferred intermediate result from thread-local buffers. - * - * Optimized with column-major iteration and SIMD provenance encoding. - * Only materializes columns marked MATERIALIZE in the DeferredJoinNode. - * Deferred columns store 64-bit provenance encoding for resolution at final - * output. - * - * @tparam Mode Collection mode for compile-time specialization. - * @param buffers Thread-local match buffers from probe. - * @param build_input Build side data source. - * @param probe_input Probe side data source. - * @param join_node Deferred join node with materialization decisions. - * @param remapped_attrs Output attributes (after build/probe remapping). - * @param build_output_size Number of columns from build side. - * @param build_is_left True if build side is the original left child. - * @param columnar_reader Reader for columnar data access. - * @param out_result Output DeferredResult (populated in-place). - * @param deferred_plan Full deferred plan for base table access. 
- */ -template -void construct_deferred_from_buffers( - std::vector> &buffers, - const DeferredInput &build_input, const DeferredInput &probe_input, - const DeferredJoinNode &join_node, - const std::vector> &remapped_attrs, - size_t build_output_size, bool build_is_left, - ColumnarReader &columnar_reader, DeferredResult &out_result, - const DeferredPlan &deferred_plan) { - - // Count total matches and compute buffer start offsets - size_t total_matches = 0; - std::vector buffer_starts(buffers.size()); - for (size_t i = 0; i < buffers.size(); ++i) { - buffer_starts[i] = total_matches; - total_matches += buffers[i].count(); - } - - if (total_matches == 0) { - out_result = create_empty_deferred_result(join_node); - return; - } - - // Initialize result metadata - out_result.node_info = &join_node; - out_result.num_rows = total_matches; - out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); - out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); - - size_t mat_count = 0; - size_t def_count = 0; - for (const auto &col : join_node.columns) { - if (col.resolution == ColumnResolution::MATERIALIZE) { - out_result.materialized_map[col.original_idx] = mat_count++; - } else { - out_result.deferred_map[col.original_idx] = def_count++; - } - } - - // Precompute sources for column-major iteration - auto mat_sources = prepare_materialized_sources(join_node, build_input, - probe_input, build_is_left); - auto deferred_sources = prepare_deferred_sources( - join_node, build_input, probe_input, build_is_left); - - // Pre-allocate pages - using Page = mema::column_t::Page; - using DeferredPage = mema::deferred_column_t::Page; - size_t mat_pages_needed = - (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; - size_t def_pages_needed = - (total_matches + mema::deferred_column_t::ENTRIES_PER_PAGE - 1) / - mema::deferred_column_t::ENTRIES_PER_PAGE; - - out_result.materialized.resize(mat_count); - for (size_t c = 0; c < mat_count; ++c) { - 
out_result.materialized[c].pages.resize(mat_pages_needed); - out_result.materialized[c].set_row_count(total_matches); - } - - out_result.deferred_columns.resize(def_count); - for (size_t d = 0; d < def_count; ++d) { - out_result.deferred_columns[d].pages.resize(def_pages_needed); - out_result.deferred_columns[d].set_row_count(total_matches); - } - - // Set source metadata for materialized columns - for (const auto &src : mat_sources) { - out_result.materialized[src.mat_col_idx].source_table = - src.base_table_id; - out_result.materialized[src.mat_col_idx].source_column = - src.base_column_idx; - } - - const size_t num_threads = THREAD_COUNT; - - // Parallel page allocation - worker_pool().execute([&](size_t t) { - for (size_t c = 0; c < mat_count; ++c) { - auto &col = out_result.materialized[c]; - for (size_t p = t; p < mat_pages_needed; p += num_threads) { - void *ptr = - Contest::platform::get_arena(t) - .alloc_chunk(); - col.pages[p] = reinterpret_cast(ptr); - } - } - for (size_t d = 0; d < def_count; ++d) { - auto &def_col = out_result.deferred_columns[d]; - for (size_t p = t; p < def_pages_needed; p += num_threads) { - void *ptr = - Contest::platform::get_arena(t) - .alloc_chunk< - Contest::platform::ChunkType::DEFERRED_PAGE>(); - def_col.pages[p] = reinterpret_cast(ptr); - } - } - }); - - // ======================================================================== - // COLUMN-MAJOR PARALLEL POPULATION - // ======================================================================== - worker_pool().execute([&](size_t t) { - if (t >= buffers.size()) - return; - auto &buf = buffers[t]; - size_t my_count = buf.count(); - if (my_count == 0) - return; - - size_t start = buffer_starts[t]; - ColumnarReader::Cursor cursor; - - // ==================================================================== - // Process MATERIALIZED columns (column-major for cache locality) - // ==================================================================== - for (const auto &src : 
mat_sources) { - auto &dest_col = out_result.materialized[src.mat_col_idx]; - - // Get appropriate range based on which side this column comes from - auto range = src.from_build ? buf.left_range() : buf.right_range(); - - if (src.is_columnar) { - // Columnar source - use ColumnarReader with cursor caching - const auto &col = *src.columnar_col; - size_t k = start; - for (uint32_t rid : range) { - dest_col.write_at(k++, - columnar_reader.read_value( - col, src.child_output_idx, rid, - src.type, cursor, src.from_build)); - } - } else if (src.intermediate_col) { - // Intermediate materialized source - direct copy - const auto &vec = *src.intermediate_col; - size_t k = start; - for (uint32_t rid : range) { - dest_col.write_at(k++, vec[rid]); - } - } else if (src.needs_deferred_resolve && src.deferred_resolve_col) { - // Deferred in child - resolve via provenance - const auto &def_col = *src.deferred_resolve_col; - size_t k = start; - for (uint32_t rid : range) { - uint64_t prov = def_col[rid]; - uint8_t base_tid = DeferredProvenance::table(prov); - uint8_t base_col = DeferredProvenance::column(prov); - uint64_t base_row = DeferredProvenance::row(prov); - - if (deferred_plan.original_plan) [[likely]] { - const auto &base_table = - deferred_plan.original_plan->inputs[base_tid]; - mema::value_t val = - columnar_reader.read_value_direct_public( - base_table.columns[base_col], - static_cast(base_row), src.type); - dest_col.write_at(k++, val); - } else { - dest_col.write_at( - k++, mema::value_t{mema::value_t::NULL_VALUE}); - } - } - } - } - - // ==================================================================== - // Process DEFERRED columns (column-major with SIMD batch encoding) - // ==================================================================== - for (size_t d = 0; d < deferred_sources.size(); ++d) { - const auto &def_src = deferred_sources[d]; - auto &dest_def_col = out_result.deferred_columns[d]; - - if (def_src.needs_encode) { - // Fresh encoding from columnar 
input - use SIMD batch - auto batch_reader = def_src.from_build - ? buf.left_batch_reader() - : buf.right_batch_reader(); - - size_t k = start; - while (batch_reader.has_more()) { - size_t batch_count; - // Request larger batches for SIMD efficiency - constexpr size_t MAX_BATCH = - simd_provenance::BATCH_SIZE > 0 ? 64 : 256; - const uint32_t *row_ids = - batch_reader.get_batch(MAX_BATCH, batch_count); - - if (batch_count > 0) { - simd_provenance::encode_provenance_batch( - dest_def_col, k, row_ids, batch_count, - def_src.base_table_id, def_src.base_column_idx); - k += batch_count; - } - } - } else if (def_src.source_col) { - // Copy existing provenance from child intermediate - auto batch_reader = def_src.from_build - ? buf.left_batch_reader() - : buf.right_batch_reader(); - - size_t k = start; - while (batch_reader.has_more()) { - size_t batch_count; - const uint32_t *row_ids = - batch_reader.get_batch(256, batch_count); - - if (batch_count > 0) { - simd_provenance::copy_provenance_batch( - dest_def_col, k, *def_src.source_col, row_ids, - batch_count); - k += batch_count; - } - } - } - } - }); -} - -} // namespace materialize -} // namespace Contest diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index 090863f..ed1834d 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -2,156 +2,345 @@ * @file construct_intermediate.h * @brief Constructs intermediate results for multi-way joins. * - * Allocates and populates ExecuteResult (column_t) from match collectors. - * Templated on MatchCollectionMode for zero-overhead mode selection. + * Allocates and populates IntermediateResult with only MATERIALIZE columns + * (typically just the parent's join key). Deferred columns store 64-bit + * provenance (table_id, column_idx, row_id) for resolution at final output. 
+ * + * Optimized with: + * - Column-major iteration for cache locality + * - Precomputed source metadata to avoid per-row variant access + * - SIMD provenance encoding (AVX2/NEON) for deferred columns + * - Batch access to match collector chunks + * + * @see materialize.h for final resolution of deferred columns. */ #pragma once +#include +#include + #include +#include #include -#include -#include +#include #include #include #include -#include -/** - * @namespace Contest::materialize - * @brief Materialization of join results into columnar format. - * - * @see intermediate.h for column_t/value_t format details. - */ -namespace Contest::materialize { -using Contest::ExecuteResult; -using Contest::ExtendedResult; -using Contest::GlobalRowId; +#if defined(__x86_64__) +#include +#elif defined(__aarch64__) +#include +#endif + +namespace Contest { +namespace materialize { + using Contest::io::ColumnarReader; -using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; using Contest::join::ThreadLocalMatchBuffer; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; +// ============================================================================ +// SIMD Provenance Encoding +// ============================================================================ + +namespace simd_provenance { + +#if defined(__x86_64__) && defined(__AVX2__) +inline constexpr size_t BATCH_SIZE = 4; ///< 4 x uint64_t in AVX2 (256-bit) +#elif defined(__aarch64__) +inline constexpr size_t BATCH_SIZE = 2; ///< 2 x uint64_t in NEON (128-bit) +#else +inline constexpr size_t BATCH_SIZE = 0; ///< No SIMD available +#endif + /** - * @brief Precomputed metadata for resolving an output column's source. + * @brief Encode provenance for batch of row IDs using SIMD. * - * Avoids per-value std::variant accesses and tuple lookups in hot loop. - * 8-byte alignment optimizes struct packing for vector iteration. 
+ * Encodes (table_id << 56) | (column_idx << 48) | row_id for each row. + * Uses AVX2 on x86_64 or NEON on aarch64, with scalar fallback. * - * @see prepare_sources() for precomputation logic. + * @param dest Destination deferred column + * @param start_idx Starting output index + * @param row_ids Pointer to row IDs (from IndexChunk, contiguous) + * @param count Number of row IDs to process + * @param table_id Base table ID (constant for all rows) + * @param column_idx Base column index (constant for all rows) + * @return Number of rows processed (always == count) */ -struct alignas(8) SourceInfo { - const mema::column_t *intermediate_col = - nullptr; /**< Source if intermediate. */ - const Column *columnar_col = nullptr; /**< Source if columnar. */ - size_t remapped_col_idx = 0; /**< Local index within source side. */ - bool is_columnar = false; /**< True if source is columnar table. */ - bool from_build = false; /**< True if from build side, false if probe. */ -}; +inline size_t encode_provenance_batch(mema::deferred_column_t &dest, + size_t start_idx, const uint32_t *row_ids, + size_t count, uint8_t table_id, + uint8_t column_idx) { + // Precompute constant prefix: (table_id << 56) | (column_idx << 48) + const uint64_t prefix = DeferredProvenance::encode(table_id, column_idx, 0); + + size_t i = 0; + +#if defined(__x86_64__) && defined(__AVX2__) + // AVX2: Process 4 x uint64_t at a time + // Load 4 x uint32_t, zero-extend to 4 x uint64_t, OR with prefix + const __m256i prefix_vec = _mm256_set1_epi64x(static_cast(prefix)); + + for (; i + 4 <= count; i += 4) { + // Load 4 x uint32_t and zero-extend to 4 x uint64_t + __m128i rows_32 = + _mm_loadu_si128(reinterpret_cast(row_ids + i)); + __m256i rows_64 = _mm256_cvtepu32_epi64(rows_32); + + // OR with prefix to create provenance values + __m256i result = _mm256_or_si256(rows_64, prefix_vec); + + // Store to aligned buffer, then write individually (page-safe) + alignas(32) uint64_t out[4]; + 
_mm256_store_si256(reinterpret_cast<__m256i *>(out), result); + + dest.write_at(start_idx + i, out[0]); + dest.write_at(start_idx + i + 1, out[1]); + dest.write_at(start_idx + i + 2, out[2]); + dest.write_at(start_idx + i + 3, out[3]); + } +#elif defined(__aarch64__) + // NEON: Process 2 x uint64_t at a time + const uint64x2_t prefix_vec = vdupq_n_u64(prefix); + + for (; i + 2 <= count; i += 2) { + // Load 2 x uint32_t and zero-extend to 2 x uint64_t + uint32x2_t rows_32 = vld1_u32(row_ids + i); + uint64x2_t rows_64 = vmovl_u32(rows_32); + + // OR with prefix + uint64x2_t result = vorrq_u64(rows_64, prefix_vec); + + // Store individually (page boundary safe) + dest.write_at(start_idx + i, vgetq_lane_u64(result, 0)); + dest.write_at(start_idx + i + 1, vgetq_lane_u64(result, 1)); + } +#endif + + // Scalar remainder + for (; i < count; ++i) { + dest.write_at(start_idx + i, + prefix | static_cast(row_ids[i])); + } + + return count; +} /** - * @brief Builds SourceInfo for each output column for fast hot-loop lookup. + * @brief Copy provenance from source column using batch reads. * - * @param remapped_attrs Output column specifications (global indexing). - * @param build_input Build side data (ColumnarTable* or ExtendedResult). - * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). - * @param build_node PlanNode for build side (contains output_attrs). - * @param probe_node PlanNode for probe side (contains output_attrs). - * @param build_size Number of columns from build side. - * @return Vector of SourceInfo, one per output column. + * Copies existing 64-bit provenance values from child intermediate. + * Uses contiguous batch access for better cache behavior. * - * @see SourceInfo for field documentation. - * @see construct_intermediate() for consumption in hot loop. 
+ * @param dest Destination deferred column + * @param start_idx Starting output index + * @param src Source deferred column (from child) + * @param row_ids Row indices into source column + * @param count Number of rows to copy + * @return Number of rows processed (always == count) */ -inline std::vector -prepare_sources(const std::vector> &remapped_attrs, - const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - size_t build_size) { - std::vector sources; - sources.reserve(remapped_attrs.size()); - for (const auto &[col_idx, _] : remapped_attrs) { - SourceInfo info; - info.from_build = (col_idx < build_size); - size_t local_idx = info.from_build ? col_idx : col_idx - build_size; - info.remapped_col_idx = local_idx; - const JoinInput &input = info.from_build ? build_input : probe_input; - const PlanNode &node = info.from_build ? build_node : probe_node; - if (input.is_columnar()) { - info.is_columnar = true; - auto *table = std::get(input.data); - auto [actual_idx, _] = node.output_attrs[local_idx]; - info.columnar_col = &table->columns[actual_idx]; - } else { - info.is_columnar = false; - const auto &res = std::get(input.data); - info.intermediate_col = &res.columns[local_idx]; - } - sources.push_back(info); +inline size_t copy_provenance_batch(mema::deferred_column_t &dest, + size_t start_idx, + const mema::deferred_column_t &src, + const uint32_t *row_ids, size_t count) { + for (size_t i = 0; i < count; ++i) { + dest.write_at(start_idx + i, src[row_ids[i]]); } - return sources; + return count; } +} // namespace simd_provenance + +// ============================================================================ +// Source Precomputation Structures +// ============================================================================ + /** - * @brief Precomputed metadata for resolving a row ID column's source. + * @brief Precomputed metadata for deferred column sources. 
* - * Determines how to populate each output row ID column: - * - For columnar input: encode GlobalRowId on-the-fly from local index - * - For intermediate input: copy from existing rowid_column_t + * Tracks where each deferred column's provenance comes from: + * - For columnar inputs: encode fresh (table_id, column_idx, row_id) + * - For IntermediateResult inputs: copy existing provenance from child + */ +struct DeferredColumnSource { + const mema::deferred_column_t *source_col = + nullptr; ///< Source if from intermediate. + uint8_t base_table_id = 0; ///< Base table ID for encoding. + uint8_t base_column_idx = 0; ///< Base column index for encoding. + bool from_build = false; ///< True if from build side. + bool needs_encode = false; ///< True if columnar (needs fresh encode). +}; + +/** + * @brief Precomputed metadata for materialized column sources. * - * @see prepare_rowid_sources() for precomputation logic. + * Eliminates per-row std::variant access and conditional checks in hot loop. */ -struct alignas(8) RowIdSource { - const mema::rowid_column_t *source_col = - nullptr; /**< Source if from intermediate (else encode). */ - uint8_t table_id = 0; /**< Table ID for encoding/lookup. */ - bool from_build = false; /**< True if from build side, false if probe. */ - bool needs_encode = - false; /**< True if columnar (needs GlobalRowId encode). 
*/ +struct alignas(8) MaterializedColumnSource { + const mema::column_t *intermediate_col = + nullptr; ///< Source if from IntermediateResult materialized + const Column *columnar_col = nullptr; ///< Source if from ColumnarTable + const mema::deferred_column_t *deferred_resolve_col = + nullptr; ///< Source if needs deferred resolution + size_t child_output_idx = 0; ///< Index in child's output + size_t mat_col_idx = 0; ///< Index in result.materialized[] + DataType type = DataType::INT32; + uint8_t base_table_id = 0; ///< For VARCHAR source tracking + uint8_t base_column_idx = 0; ///< For VARCHAR source tracking + bool is_columnar = false; ///< True if source is ColumnarTable + bool from_build = false; ///< True if from build side + bool needs_deferred_resolve = false; ///< True if child deferred this column }; +// ============================================================================ +// Helper Functions +// ============================================================================ + /** - * @brief Builds RowIdSource for each output row ID column. + * @brief Collect columns needed from a JoinInput for page index building. + */ +inline platform::ArenaVector +collect_input_columns(const JoinInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for intermediate construction. * - * @param merged_table_ids Sorted, unique table IDs to track in output. 
- * @param build_input Build side data (ColumnarTable* or ExtendedResult). - * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). - * @return Vector of RowIdSource, one per tracked table. + * Sets up page indices for columns that need to be read from columnar inputs. */ -inline std::vector -prepare_rowid_sources(const std::vector &merged_table_ids, - const JoinInput &build_input, - const JoinInput &probe_input) { - std::vector sources; - sources.reserve(merged_table_ids.size()); - - for (uint8_t tid : merged_table_ids) { - RowIdSource src; - src.table_id = tid; - - // Check build side first - auto build_tables = build_input.tracked_tables(); - bool in_build = std::find(build_tables.begin(), build_tables.end(), - tid) != build_tables.end(); - if (in_build) { - src.from_build = true; - if (build_input.is_columnar()) { - src.needs_encode = true; - src.source_col = nullptr; - } else { - src.needs_encode = false; - src.source_col = build_input.get_rowid_column(tid); +inline void prepare_intermediate_columns( + ColumnarReader &reader, const JoinInput &build_input, + const JoinInput &probe_input, const AnalyzedJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // Determine which columns from each side are needed + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark columns needed based on materialization decisions + for (const auto 
&col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && + col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; } + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_input_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_input_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty intermediate result with proper schema. + */ +inline IntermediateResult +create_empty_intermediate_result(const AnalyzedJoinNode &node) { + IntermediateResult result; + result.node_info = &node; + result.num_rows = 0; + result.materialized_map.resize(node.columns.size(), std::nullopt); + result.deferred_map.resize(node.columns.size(), std::nullopt); + + size_t mat_count = 0; + size_t def_count = 0; + for (const auto &col : node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + result.materialized_map[col.original_idx] = mat_count++; + } else { + result.deferred_map[col.original_idx] = def_count++; + } + } + result.materialized.resize(mat_count); + result.deferred_columns.resize(def_count); + + return result; +} + +/** + * @brief Prepare deferred column sources for intermediate construction. 
+ */ +inline std::vector +prepare_deferred_sources(const AnalyzedJoinNode &join_node, + const JoinInput &build_input, + const JoinInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.num_deferred_columns); + + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::DEFER) + continue; + + DeferredColumnSource src; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; } else { - // Must be from probe side - src.from_build = false; - if (probe_input.is_columnar()) { + const auto *child_def = + src_input.get_deferred_column(col.child_output_idx); + if (child_def) { + src.needs_encode = false; + src.source_col = child_def; + } else { src.needs_encode = true; src.source_col = nullptr; - } else { - src.needs_encode = false; - src.source_col = probe_input.get_rowid_column(tid); } } sources.push_back(src); @@ -160,34 +349,93 @@ prepare_rowid_sources(const std::vector &merged_table_ids, } /** - * @brief Constructs intermediate results directly from thread-local buffers. + * @brief Precompute materialized column sources for column-major iteration. * - * Each thread iterates its own buffer, avoiding the merge step. Total matches - * computed by summing buffer counts. Each thread writes its contiguous portion - * of output pages. Also populates row ID columns for provenance tracking. + * For each MATERIALIZE column, determines source type and caches pointers + * to avoid per-row std::variant access in the hot loop. 
+ */ +inline std::vector +prepare_materialized_sources(const AnalyzedJoinNode &join_node, + const JoinInput &build_input, + const JoinInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.columns.size()); + + size_t mat_idx = 0; + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + MaterializedColumnSource src; + src.mat_col_idx = mat_idx++; + src.child_output_idx = col.child_output_idx; + src.type = col.type; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.is_columnar = true; + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col.child_output_idx]; + src.columnar_col = &table->columns[actual_idx]; + } else { + src.is_columnar = false; + const auto &ir = std::get(src_input.data); + + if (ir.is_materialized(col.child_output_idx)) { + src.intermediate_col = + ir.get_materialized(col.child_output_idx); + } else if (ir.is_deferred(col.child_output_idx)) { + src.needs_deferred_resolve = true; + src.deferred_resolve_col = + ir.get_deferred(col.child_output_idx); + } + } + sources.push_back(src); + } + + return sources; +} + +// ============================================================================ +// Main Construction Function +// ============================================================================ + +/** + * @brief Constructs intermediate result from thread-local buffers. + * + * Optimized with column-major iteration and SIMD provenance encoding. + * Only materializes columns marked MATERIALIZE in the AnalyzedJoinNode. + * Deferred columns store 64-bit provenance encoding for resolution at final + * output. * * @tparam Mode Collection mode for compile-time specialization. 
- * @param buffers Vector of ThreadLocalMatchBuffer from probe. - * @param build_input Build side data (ColumnarTable* or ExtendedResult). - * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). - * @param remapped_attrs Output column specifications (global indexing). - * @param build_node PlanNode for build side output_attrs mapping. - * @param probe_node PlanNode for probe side output_attrs mapping. - * @param build_size Number of output columns from build side. - * @param columnar_reader ColumnarReader with Cursor caching for page access. - * @param results Pre-initialized ExtendedResult, populated in-place. - * @param merged_table_ids Sorted, unique table IDs to track in output. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side data source. + * @param probe_input Probe side data source. + * @param join_node Analyzed join node with materialization decisions. + * @param remapped_attrs Output attributes (after build/probe remapping). + * @param build_output_size Number of columns from build side. + * @param build_is_left True if build side is the original left child. + * @param columnar_reader Reader for columnar data access. + * @param out_result Output IntermediateResult (populated in-place). + * @param analyzed_plan Full analyzed plan for base table access. 
*/ template -inline void construct_intermediate_from_buffers( +void construct_intermediate_from_buffers( std::vector> &buffers, const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const std::vector> &remapped_attrs, - const PlanNode &build_node, const PlanNode &probe_node, size_t build_size, - ColumnarReader &columnar_reader, ExtendedResult &results, - const std::vector &merged_table_ids) { + size_t build_output_size, bool build_is_left, + ColumnarReader &columnar_reader, IntermediateResult &out_result, + const AnalyzedPlan &analyzed_plan) { - // Compute total matches and per-buffer start offsets + // Count total matches and compute buffer start offsets size_t total_matches = 0; std::vector buffer_starts(buffers.size()); for (size_t i = 0; i < buffers.size(); ++i) { @@ -195,63 +443,90 @@ inline void construct_intermediate_from_buffers( total_matches += buffers[i].count(); } - if (total_matches == 0) + if (total_matches == 0) { + out_result = create_empty_intermediate_result(join_node); return; + } - auto sources = prepare_sources(remapped_attrs, build_input, probe_input, - build_node, probe_node, build_size); - auto rowid_sources = - prepare_rowid_sources(merged_table_ids, build_input, probe_input); + // Initialize result metadata + out_result.node_info = &join_node; + out_result.num_rows = total_matches; + out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); + out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); - const size_t num_threads = THREAD_COUNT; - const size_t num_cols = sources.size(); - const size_t num_rowid_cols = rowid_sources.size(); + size_t mat_count = 0; + size_t def_count = 0; + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + out_result.materialized_map[col.original_idx] = mat_count++; + } else { + out_result.deferred_map[col.original_idx] = def_count++; + } + } + + // Precompute sources for column-major 
iteration + auto mat_sources = prepare_materialized_sources(join_node, build_input, + probe_input, build_is_left); + auto deferred_sources = prepare_deferred_sources( + join_node, build_input, probe_input, build_is_left); - // Pre-size page vectors for each data column + // Pre-allocate pages using Page = mema::column_t::Page; - using RowIdPage = mema::rowid_column_t::Page; - size_t total_pages_needed = + using DeferredPage = mema::deferred_column_t::Page; + size_t mat_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + size_t def_pages_needed = + (total_matches + mema::deferred_column_t::ENTRIES_PER_PAGE - 1) / + mema::deferred_column_t::ENTRIES_PER_PAGE; - for (size_t c = 0; c < num_cols; ++c) { - auto &col = results.columns[c]; - col.pages.resize(total_pages_needed); - col.set_row_count(total_matches); + out_result.materialized.resize(mat_count); + for (size_t c = 0; c < mat_count; ++c) { + out_result.materialized[c].pages.resize(mat_pages_needed); + out_result.materialized[c].set_row_count(total_matches); } - // Setup row ID columns in results - results.table_ids = merged_table_ids; - results.row_ids.resize(num_rowid_cols); - for (size_t r = 0; r < num_rowid_cols; ++r) { - results.row_ids[r].table_id = merged_table_ids[r]; - results.row_ids[r].pages.resize(total_pages_needed); - results.row_ids[r].set_row_count(total_matches); + out_result.deferred_columns.resize(def_count); + for (size_t d = 0; d < def_count; ++d) { + out_result.deferred_columns[d].pages.resize(def_pages_needed); + out_result.deferred_columns[d].set_row_count(total_matches); } - // Parallel page allocation - each thread allocates its own pages + // Set source metadata for materialized columns + for (const auto &src : mat_sources) { + out_result.materialized[src.mat_col_idx].source_table = + src.base_table_id; + out_result.materialized[src.mat_col_idx].source_column = + src.base_column_idx; + } + + const size_t num_threads = THREAD_COUNT; + + // Parallel page 
allocation worker_pool().execute([&](size_t t) { - for (size_t c = 0; c < num_cols; ++c) { - auto &col = results.columns[c]; - for (size_t p = t; p < total_pages_needed; p += num_threads) { + for (size_t c = 0; c < mat_count; ++c) { + auto &col = out_result.materialized[c]; + for (size_t p = t; p < mat_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) .alloc_chunk(); col.pages[p] = reinterpret_cast(ptr); } } - // Allocate row ID pages - for (size_t r = 0; r < num_rowid_cols; ++r) { - auto &rid_col = results.row_ids[r]; - for (size_t p = t; p < total_pages_needed; p += num_threads) { + for (size_t d = 0; d < def_count; ++d) { + auto &def_col = out_result.deferred_columns[d]; + for (size_t p = t; p < def_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) - .alloc_chunk(); - rid_col.pages[p] = reinterpret_cast(ptr); + .alloc_chunk< + Contest::platform::ChunkType::DEFERRED_PAGE>(); + def_col.pages[p] = reinterpret_cast(ptr); } } }); - // Parallel: each thread processes its own buffer + // ======================================================================== + // COLUMN-MAJOR PARALLEL POPULATION + // ======================================================================== worker_pool().execute([&](size_t t) { if (t >= buffers.size()) return; @@ -261,89 +536,106 @@ inline void construct_intermediate_from_buffers( return; size_t start = buffer_starts[t]; - Contest::ColumnarReader::Cursor cursor; + ColumnarReader::Cursor cursor; - // Process data columns - for (size_t c = 0; c < num_cols; ++c) { - const auto &src = sources[c]; - auto &dest_col = results.columns[c]; + // ==================================================================== + // Process MATERIALIZED columns (column-major for cache locality) + // ==================================================================== + for (const auto &src : mat_sources) { + auto &dest_col = out_result.materialized[src.mat_col_idx]; - auto left_range = buf.left_range(); - 
auto right_range = buf.right_range(); + // Get appropriate range based on which side this column comes from + auto range = src.from_build ? buf.left_range() : buf.right_range(); if (src.is_columnar) { + // Columnar source - use ColumnarReader with cursor caching const auto &col = *src.columnar_col; - if (src.from_build) { - size_t k = start; - for (uint32_t rid : left_range) { - dest_col.write_at(k++, - columnar_reader.read_value( - col, src.remapped_col_idx, rid, - col.type, cursor, true)); - } - } else { - size_t k = start; - for (uint32_t rid : right_range) { - dest_col.write_at(k++, - columnar_reader.read_value( - col, src.remapped_col_idx, rid, - col.type, cursor, false)); - } + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, + columnar_reader.read_value( + col, src.child_output_idx, rid, + src.type, cursor, src.from_build)); } - } else { + } else if (src.intermediate_col) { + // Intermediate materialized source - direct copy const auto &vec = *src.intermediate_col; - if (src.from_build) { - size_t k = start; - for (uint32_t rid : left_range) { - dest_col.write_at(k++, vec[rid]); - } - } else { - size_t k = start; - for (uint32_t rid : right_range) { - dest_col.write_at(k++, vec[rid]); + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, vec[rid]); + } + } else if (src.needs_deferred_resolve && src.deferred_resolve_col) { + // Deferred in child - resolve via provenance + const auto &def_col = *src.deferred_resolve_col; + size_t k = start; + for (uint32_t rid : range) { + uint64_t prov = def_col[rid]; + uint8_t base_tid = DeferredProvenance::table(prov); + uint8_t base_col = DeferredProvenance::column(prov); + uint64_t base_row = DeferredProvenance::row(prov); + + if (analyzed_plan.original_plan) [[likely]] { + const auto &base_table = + analyzed_plan.original_plan->inputs[base_tid]; + mema::value_t val = + columnar_reader.read_value_direct_public( + base_table.columns[base_col], + static_cast(base_row), 
src.type); + dest_col.write_at(k++, val); + } else { + dest_col.write_at( + k++, mema::value_t{mema::value_t::NULL_VALUE}); } } } } - // Process row ID columns - for (size_t r = 0; r < num_rowid_cols; ++r) { - const auto &rid_src = rowid_sources[r]; - auto &dest_rid_col = results.row_ids[r]; + // ==================================================================== + // Process DEFERRED columns (column-major with SIMD batch encoding) + // ==================================================================== + for (size_t d = 0; d < deferred_sources.size(); ++d) { + const auto &def_src = deferred_sources[d]; + auto &dest_def_col = out_result.deferred_columns[d]; - auto left_range = buf.left_range(); - auto right_range = buf.right_range(); + if (def_src.needs_encode) { + // Fresh encoding from columnar input - use SIMD batch + auto batch_reader = def_src.from_build + ? buf.left_batch_reader() + : buf.right_batch_reader(); - if (rid_src.from_build) { size_t k = start; - if (rid_src.needs_encode) { - // Columnar build: encode GlobalRowId on-the-fly - for (uint32_t local_idx : left_range) { - dest_rid_col.write_at( - k++, - GlobalRowId::encode(rid_src.table_id, local_idx)); - } - } else { - // Intermediate build: copy from source row ID column - const auto &src_col = *rid_src.source_col; - for (uint32_t local_idx : left_range) { - dest_rid_col.write_at(k++, src_col[local_idx]); + while (batch_reader.has_more()) { + size_t batch_count; + // Request larger batches for SIMD efficiency + constexpr size_t MAX_BATCH = + simd_provenance::BATCH_SIZE > 0 ? 64 : 256; + const uint32_t *row_ids = + batch_reader.get_batch(MAX_BATCH, batch_count); + + if (batch_count > 0) { + simd_provenance::encode_provenance_batch( + dest_def_col, k, row_ids, batch_count, + def_src.base_table_id, def_src.base_column_idx); + k += batch_count; } } - } else { + } else if (def_src.source_col) { + // Copy existing provenance from child intermediate + auto batch_reader = def_src.from_build + ? 
buf.left_batch_reader() + : buf.right_batch_reader(); + size_t k = start; - if (rid_src.needs_encode) { - // Columnar probe: encode GlobalRowId on-the-fly - for (uint32_t local_idx : right_range) { - dest_rid_col.write_at( - k++, - GlobalRowId::encode(rid_src.table_id, local_idx)); - } - } else { - // Intermediate probe: copy from source row ID column - const auto &src_col = *rid_src.source_col; - for (uint32_t local_idx : right_range) { - dest_rid_col.write_at(k++, src_col[local_idx]); + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + simd_provenance::copy_provenance_batch( + dest_def_col, k, *def_src.source_col, row_ids, + batch_count); + k += batch_count; } } } @@ -351,4 +643,5 @@ inline void construct_intermediate_from_buffers( }); } -} // namespace Contest::materialize +} // namespace materialize +} // namespace Contest diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index 6d4a3be..dca7f49 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -1,67 +1,150 @@ /** * @file materialize.h - * @brief Materialization of join results into ColumnarTable format. + * @brief Final materialization for execution path. * - * Parallel materialization using per-thread page builders and mmap allocation. - * Templated on MatchCollectionMode for zero-overhead mode selection. + * Materializes all output columns at the root join, resolving deferred + * columns by decoding 64-bit provenance (table_id, column_idx, row_id) back + * to base tables. + * + * @see construct_intermediate.h for building IntermediateResult intermediates. 
*/ #pragma once -#include #include +#include +#include +#include + #include +#include #include -#include -#include -#include +#include #include -#include #include +#include #include -#include -#include -/** @namespace Contest::materialize @brief Join result materialization. */ -namespace Contest::materialize { +namespace Contest { +namespace materialize { -using Contest::ExecuteResult; -using Contest::ExtendedResult; using Contest::io::ColumnarReader; -using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; -using Contest::join::resolve_input_source; using Contest::join::ThreadLocalMatchBuffer; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; -/** @brief Creates empty ColumnarTable with correct column types for zero-match - * case. */ +/** + * @brief Collect columns needed from a JoinInput for final materialization. + */ +inline platform::ArenaVector +collect_final_columns(const JoinInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for final materialization at root. + * + * Sets up page indices for ALL output columns (since all need materialization + * at root). 
+ */ +inline void prepare_final_columns( + ColumnarReader &reader, const JoinInput &build_input, + const JoinInput &probe_input, const AnalyzedJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // All output columns needed at root + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark ALL columns needed for final materialization + // from_left refers to original left child + // build_is_left tells us if build side is the left child + for (const auto &col : join_node.columns) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_final_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_final_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty result for zero-match case. 
+ */ inline ColumnarTable create_empty_result( - const std::vector> &remapped_attrs) { + const std::vector> &output_attrs) { ColumnarTable empty_result; empty_result.num_rows = 0; - for (auto [_, data_type] : remapped_attrs) { + for (auto [_, data_type] : output_attrs) { empty_result.columns.emplace_back(data_type); } return empty_result; } /** - * @brief Parallel materialization of a single output column from thread-local - * buffers. + * @brief Materialize a single column from sources. * - * Each thread processes its own buffer directly without merge overhead. + * Handles three cases: + * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index + * 2. MATERIALIZED: Column was materialized in IntermediateResult + * 3. DEFERRED: Resolve via 64-bit provenance to base table * - * @tparam Mode Collection mode for compile-time specialization. - * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. - * @tparam ReaderFunc Callable: (row_id, cursor) -> value_t. + * @tparam Mode Collection mode for compile-time specialization. + * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. + * @tparam ReaderFunc Callable: (row_idx, cursor) -> value_t. * @tparam InitBuilderFunc Callable: (page_allocator) -> BuilderType. - * @param est_bytes_per_row Average bytes per row (4 for INT32, ~35 for - * VARCHAR). */ template -inline void materialize_column_from_buffers( +inline void materialize_column( Column &dest_col, std::vector> &buffers, size_t total_matches, ReaderFunc &&read_value, InitBuilderFunc &&init_builder, bool from_build, size_t est_bytes_per_row) { @@ -163,133 +246,188 @@ inline void materialize_column_from_buffers( } /** - * @brief Materializes a single output column from thread-local buffers. + * @brief Materialize single output column handling deferred resolution. * - * Dispatcher that determines source location (columnar/intermediate, - * build/probe), selects page builder type, and invokes - * materialize_column_from_buffers<>. 
VARCHAR handling requires source Column - * pointer for string dereferencing. + * For deferred columns, resolves via 64-bit provenance encoding back to + * base table. * * @tparam Mode Collection mode for compile-time specialization. */ template -inline void materialize_single_column_from_buffers( - Column &dest_col, size_t col_idx, size_t build_size, +inline void materialize_single_column( + Column &dest_col, size_t col_idx, size_t build_size, bool build_is_left, std::vector> &buffers, size_t total_matches, const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - ColumnarReader &columnar_reader, const Plan &plan) { - - auto [input, node, local_idx] = resolve_input_source( - col_idx, build_size, build_input, build_node, probe_input, probe_node); - bool from_build = col_idx < build_size; + const AnalyzedJoinNode &join_node, ColumnarReader &columnar_reader, + const AnalyzedPlan &analyzed_plan) { + + // Find column info + const AnalyzedColumnInfo *col_info = nullptr; + for (const auto &col : join_node.columns) { + if (col.original_idx == col_idx) { + col_info = &col; + break; + } + } - const Column *col_source = nullptr; - const mema::column_t *inter_source = nullptr; + if (!col_info) { + // Fallback - shouldn't happen + return; + } - if (input.is_columnar()) { - auto *table = std::get(input.data); - auto [actual_idx, _] = node.output_attrs[local_idx]; - col_source = &table->columns[actual_idx]; + // Determine if this column comes from build or probe side at runtime + bool from_build = (col_info->from_left == build_is_left); + const JoinInput &src_input = from_build ? 
build_input : probe_input; + + // Determine how to read the value + const Column *columnar_source = nullptr; + const mema::column_t *materialized_source = nullptr; + const mema::deferred_column_t *deferred_source = nullptr; + + if (src_input.is_columnar()) { + // Direct columnar read + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col_info->child_output_idx]; + columnar_source = &table->columns[actual_idx]; } else { - const auto &res = std::get(input.data); - inter_source = &res.columns[local_idx]; + const auto &ir = std::get(src_input.data); + if (ir.is_materialized(col_info->child_output_idx)) { + // Read from materialized column + materialized_source = + ir.get_materialized(col_info->child_output_idx); + } else if (ir.is_deferred(col_info->child_output_idx)) { + // Deferred - need to resolve via 64-bit provenance + deferred_source = ir.get_deferred(col_info->child_output_idx); + } } - auto reader = [&](uint32_t rid, ColumnarReader::Cursor &cursor, - DataType type) { - if (col_source) { - return columnar_reader.read_value(*col_source, local_idx, rid, type, - cursor, from_build); + // Create reader lambda + auto reader = [&](uint32_t local_row_id, + ColumnarReader::Cursor &cursor) -> mema::value_t { + if (columnar_source) { + return columnar_reader.read_value( + *columnar_source, col_info->child_output_idx, local_row_id, + col_info->type, cursor, from_build); + } else if (materialized_source) { + return (*materialized_source)[local_row_id]; + } else if (deferred_source && analyzed_plan.original_plan) { + // Deferred resolution: decode 64-bit provenance + uint64_t prov = (*deferred_source)[local_row_id]; + uint8_t base_tid = DeferredProvenance::table(prov); + uint8_t base_col = DeferredProvenance::column(prov); + uint64_t base_row = DeferredProvenance::row(prov); + const auto &base_table = + analyzed_plan.original_plan->inputs[base_tid]; + return columnar_reader.read_value( + base_table.columns[base_col], 
base_col, + static_cast(base_row), col_info->type, cursor, true); } - return (*inter_source)[rid]; + return mema::value_t{mema::value_t::NULL_VALUE}; }; + // Materialize based on type if (dest_col.type == DataType::INT32) { auto init = [](std::function alloc) { return Int32PageBuilder(std::move(alloc)); }; - materialize_column_from_buffers( + materialize_column( dest_col, buffers, total_matches, [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor, DataType::INT32); + return reader(rid, cursor); }, init, from_build, 4); return; } - const Column *str_src_ptr = col_source; - if (!str_src_ptr && inter_source) { - str_src_ptr = &plan.inputs[inter_source->source_table] - .columns[inter_source->source_column]; + // VARCHAR + const Column *str_src_ptr = columnar_source; + if (!str_src_ptr) { + if (materialized_source) { + str_src_ptr = &analyzed_plan.original_plan + ->inputs[materialized_source->source_table] + .columns[materialized_source->source_column]; + } else if (deferred_source && analyzed_plan.original_plan) { + // For deferred VARCHAR, get source from provenance of first row + // All rows in a deferred column share the same base table/column + str_src_ptr = &analyzed_plan.original_plan + ->inputs[col_info->provenance.base_table_id] + .columns[col_info->provenance.base_column_idx]; + } + } + + if (!str_src_ptr) { + // Shouldn't happen, but handle gracefully + return; } auto init = [str_src_ptr](std::function alloc) { return VarcharPageBuilder(*str_src_ptr, std::move(alloc)); }; - materialize_column_from_buffers( + materialize_column( dest_col, buffers, total_matches, [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor, DataType::VARCHAR); + return reader(rid, cursor); }, init, from_build, 35); } /** - * @brief Materializes all output columns from thread-local buffers into - * ColumnarTable. - * - * Dereferences VARCHAR value_t references into actual string bytes. 
+ * @brief Materialize all output columns from intermediate result.
 *
- * @tparam Mode Collection mode for compile-time specialization.
- * @param buffers Thread-local match buffers from probe.
- * @param build_input Build side data source.
- * @param probe_input Probe side data source.
- * @param remapped_attrs Output projection: (col_idx, DataType) pairs.
- * @param build_node Metadata for build side output_attrs mapping.
- * @param probe_node Metadata for probe side output_attrs mapping.
- * @param build_size Number of columns from build side.
- * @param columnar_reader PageIndex-accelerated reader for Column page access.
- * @param plan Full query plan for VARCHAR dereferencing.
- * @return ColumnarTable with self-contained page data.
+ * For root join. Resolves all deferred columns by decoding 64-bit provenance
+ * to base tables.
 *
- * @see construct_intermediate.h for creating intermediate ExecuteResult.
- * @see page_builders.h for Int32PageBuilder and VarcharPageBuilder.
+ * @tparam Mode Collection mode for compile-time specialization.
+ * @param buffers Thread-local match buffers from probe.
+ * @param build_input Build side input.
+ * @param probe_input Probe side input.
+ * @param join_node Analyzed join node with column info.
+ * @param remapped_attrs Output projection after build/probe remapping.
+ * @param build_size Number of columns from build side; build_is_left flags whether that side is the left child.
+ * @param columnar_reader Reader for columnar data.
+ * @param analyzed_plan Full analyzed plan for base table access.
+ * @return ColumnarTable with final output.
*/ template inline ColumnarTable materialize_from_buffers( std::vector> &buffers, const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const std::vector> &remapped_attrs, - const PlanNode &build_node, const PlanNode &probe_node, size_t build_size, - ColumnarReader &columnar_reader, const Plan &plan) { + size_t build_size, bool build_is_left, ColumnarReader &columnar_reader, + const AnalyzedPlan &analyzed_plan) { - // Compute total_matches + // Compute total matches size_t total_matches = 0; for (const auto &buf : buffers) { total_matches += buf.count(); } - ColumnarTable result; - result.num_rows = total_matches; - if (total_matches == 0) { - for (auto [_, dtype] : remapped_attrs) { - result.columns.emplace_back(dtype); - } - return result; + return create_empty_result(remapped_attrs); } + ColumnarTable result; + result.num_rows = total_matches; + for (size_t out_idx = 0; out_idx < remapped_attrs.size(); ++out_idx) { auto [col_idx, data_type] = remapped_attrs[out_idx]; result.columns.emplace_back(data_type); Column &dest_col = result.columns.back(); - materialize_single_column_from_buffers( - dest_col, col_idx, build_size, buffers, total_matches, build_input, - probe_input, build_node, probe_node, columnar_reader, plan); + + // Pass out_idx (output position) not col_idx (global column index) + // because materialize_single_column searches by original_idx + // which is the output position in join_node.columns + materialize_single_column(dest_col, out_idx, build_size, + build_is_left, buffers, total_matches, + build_input, probe_input, join_node, + columnar_reader, analyzed_plan); } + return result; } -} // namespace Contest::materialize +} // namespace materialize +} // namespace Contest diff --git a/include/materialization/materialize_deferred.h b/include/materialization/materialize_deferred.h deleted file mode 100644 index bd7a2af..0000000 --- a/include/materialization/materialize_deferred.h +++ /dev/null @@ -1,435 +0,0 
@@ -/** - * @file materialize_deferred.h - * @brief Final materialization for deferred execution path. - * - * Materializes all output columns at the root join, resolving deferred - * columns by decoding 64-bit provenance (table_id, column_idx, row_id) back - * to base tables. - * - * @see construct_deferred.h for building DeferredResult intermediates. - * @see materialize.h for the eager materialization equivalent. - */ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Contest { -namespace materialize { - -using Contest::io::ColumnarReader; -using Contest::join::MatchCollectionMode; -using Contest::join::ThreadLocalMatchBuffer; -using Contest::platform::THREAD_COUNT; -using Contest::platform::worker_pool; - -/** - * @brief Collect columns needed from a DeferredInput for final materialization. - */ -inline platform::ArenaVector -collect_final_columns(const DeferredInput &input, - const platform::ArenaVector &needed, - platform::ThreadArena &arena) { - platform::ArenaVector columns(arena); - if (!input.node) - return columns; - - columns.resize(input.node->output_attrs.size()); - std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); - - if (!input.is_columnar()) - return columns; - - auto *table = std::get(input.data); - for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { - if (i < needed.size() && needed[i]) { - auto [actual_col_idx, _] = input.node->output_attrs[i]; - columns[i] = &table->columns[actual_col_idx]; - } - } - return columns; -} - -/** - * @brief Prepare ColumnarReader for final deferred materialization at root. - * - * Sets up page indices for ALL output columns (since all need materialization - * at root). 
- */ -inline void prepare_final_deferred_columns( - ColumnarReader &reader, const DeferredInput &build_input, - const DeferredInput &probe_input, const DeferredJoinNode &join_node, - const std::vector> &remapped_attrs, - size_t build_size, bool build_is_left) { - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - if (!build_is_columnar && !probe_is_columnar) - return; - - auto &arena = Contest::platform::get_arena(0); - - // All output columns needed at root - platform::ArenaVector build_needed(arena); - if (build_input.node) { - build_needed.resize(build_input.node->output_attrs.size()); - std::memset(build_needed.data(), 0, build_needed.size()); - } - - platform::ArenaVector probe_needed(arena); - if (probe_input.node) { - probe_needed.resize(probe_input.node->output_attrs.size()); - std::memset(probe_needed.data(), 0, probe_needed.size()); - } - - // Mark ALL columns needed for final materialization - // from_left refers to original left child - // build_is_left tells us if build side is the left child - for (const auto &col : join_node.columns) { - bool from_build = (col.from_left == build_is_left); - if (from_build && col.child_output_idx < build_needed.size()) { - build_needed[col.child_output_idx] = 1; - } else if (!from_build && col.child_output_idx < probe_needed.size()) { - probe_needed[col.child_output_idx] = 1; - } - } - - if (build_is_columnar) { - reader.prepare_build( - collect_final_columns(build_input, build_needed, arena)); - } - - if (probe_is_columnar) { - reader.prepare_probe( - collect_final_columns(probe_input, probe_needed, arena)); - } -} - -/** - * @brief Create empty result for zero-match case in deferred path. 
- */ -inline ColumnarTable create_empty_deferred_final( - const std::vector> &output_attrs) { - ColumnarTable empty_result; - empty_result.num_rows = 0; - for (auto [_, data_type] : output_attrs) { - empty_result.columns.emplace_back(data_type); - } - return empty_result; -} - -/** - * @brief Materialize a single column from deferred sources. - * - * Handles three cases: - * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index - * 2. MATERIALIZED: Column was materialized in DeferredResult - * 3. DEFERRED: Resolve via 64-bit provenance to base table - * - * @tparam Mode Collection mode for compile-time specialization. - * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. - * @tparam ReaderFunc Callable: (row_idx, cursor) -> value_t. - * @tparam InitBuilderFunc Callable: (page_allocator) -> BuilderType. - */ -template -inline void materialize_deferred_column( - Column &dest_col, std::vector> &buffers, - size_t total_matches, ReaderFunc &&read_value, - InitBuilderFunc &&init_builder, bool from_build, size_t est_bytes_per_row) { - - if (total_matches == 0) - return; - - const int num_threads = THREAD_COUNT; - - size_t matches_per_thread = (total_matches + num_threads - 1) / num_threads; - size_t usable_per_page = PAGE_SIZE - 256; - size_t rows_per_page = std::max(1ul, usable_per_page / est_bytes_per_row); - size_t pages_per_thread = - (matches_per_thread + rows_per_page - 1) / rows_per_page + 10; - size_t total_pages = pages_per_thread * num_threads; - - void *page_memory = - mmap(nullptr, total_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (page_memory == MAP_FAILED) - throw std::bad_alloc(); - - std::vector thread_columns; - thread_columns.reserve(num_threads); - for (int i = 0; i < num_threads; ++i) { - thread_columns.emplace_back(dest_col.type); - } - - worker_pool().execute([&](size_t t) { - if (t >= buffers.size()) - return; - auto &buf = buffers[t]; - size_t my_count = buf.count(); - if (my_count 
== 0) - return; - - Column &local_col = thread_columns[t]; - - size_t thread_page_start = t * pages_per_thread; - size_t thread_page_limit = pages_per_thread; - size_t used_pages = 0; - - ColumnarReader::Cursor cursor; - - auto page_allocator = [&]() -> Page * { - Page *p; - if (used_pages < thread_page_limit) { - p = reinterpret_cast(static_cast(page_memory) + - (thread_page_start + used_pages) * - PAGE_SIZE); - used_pages++; - } else { - p = new Page(); - } - local_col.pages.push_back(p); - return p; - }; - - BuilderType builder = init_builder(page_allocator); - builder.prepare(my_count); - - const size_t check_interval = BuilderType::MIN_ROWS_PER_PAGE_CHECK; - size_t rows_since_check = 0; - - auto range = from_build ? buf.left_range() : buf.right_range(); - - for (uint32_t row_id : range) { - bool flushed = builder.add(read_value(row_id, cursor)); - - if (flushed) { - rows_since_check = 0; - } else { - rows_since_check++; - if (rows_since_check >= check_interval) { - if (builder.should_check_overflow()) { - builder.save_to_page(builder.current_page); - rows_since_check = 0; - } - if (rows_since_check > check_interval * 2) - rows_since_check = 0; - } - } - } - - if (builder.num_rows != 0) { - builder.save_to_page(builder.current_page); - } - }); - - for (auto &thread_col : thread_columns) { - for (auto *page : thread_col.pages) { - dest_col.pages.push_back(page); - } - thread_col.pages.clear(); - } - - auto *mapped_mem = new MappedMemory(page_memory, total_pages * PAGE_SIZE); - dest_col.assign_mapped_memory(mapped_mem); -} - -/** - * @brief Materialize single output column handling deferred resolution. - * - * For deferred columns, resolves via 64-bit provenance encoding back to - * base table. - * - * @tparam Mode Collection mode for compile-time specialization. 
- */ -template -inline void materialize_single_deferred_column( - Column &dest_col, size_t col_idx, size_t build_size, bool build_is_left, - std::vector> &buffers, size_t total_matches, - const DeferredInput &build_input, const DeferredInput &probe_input, - const DeferredJoinNode &join_node, ColumnarReader &columnar_reader, - const DeferredPlan &deferred_plan) { - - // Find column info - const DeferredColumnInfo *col_info = nullptr; - for (const auto &col : join_node.columns) { - if (col.original_idx == col_idx) { - col_info = &col; - break; - } - } - - if (!col_info) { - // Fallback - shouldn't happen - return; - } - - // Determine if this column comes from build or probe side at runtime - bool from_build = (col_info->from_left == build_is_left); - const DeferredInput &src_input = from_build ? build_input : probe_input; - - // Determine how to read the value - const Column *columnar_source = nullptr; - const mema::column_t *materialized_source = nullptr; - const mema::deferred_column_t *deferred_source = nullptr; - - if (src_input.is_columnar()) { - // Direct columnar read - const auto *table = std::get(src_input.data); - auto [actual_idx, _] = - src_input.node->output_attrs[col_info->child_output_idx]; - columnar_source = &table->columns[actual_idx]; - } else { - const auto &ir = std::get(src_input.data); - if (ir.is_materialized(col_info->child_output_idx)) { - // Read from materialized column - materialized_source = - ir.get_materialized(col_info->child_output_idx); - } else if (ir.is_deferred(col_info->child_output_idx)) { - // Deferred - need to resolve via 64-bit provenance - deferred_source = ir.get_deferred(col_info->child_output_idx); - } - } - - // Create reader lambda - auto reader = [&](uint32_t local_row_id, - ColumnarReader::Cursor &cursor) -> mema::value_t { - if (columnar_source) { - return columnar_reader.read_value( - *columnar_source, col_info->child_output_idx, local_row_id, - col_info->type, cursor, from_build); - } else if 
(materialized_source) { - return (*materialized_source)[local_row_id]; - } else if (deferred_source && deferred_plan.original_plan) { - // Deferred resolution: decode 64-bit provenance - uint64_t prov = (*deferred_source)[local_row_id]; - uint8_t base_tid = DeferredProvenance::table(prov); - uint8_t base_col = DeferredProvenance::column(prov); - uint64_t base_row = DeferredProvenance::row(prov); - const auto &base_table = - deferred_plan.original_plan->inputs[base_tid]; - return columnar_reader.read_value( - base_table.columns[base_col], base_col, - static_cast(base_row), col_info->type, cursor, true); - } - return mema::value_t{mema::value_t::NULL_VALUE}; - }; - - // Materialize based on type - if (dest_col.type == DataType::INT32) { - auto init = [](std::function alloc) { - return Int32PageBuilder(std::move(alloc)); - }; - materialize_deferred_column( - dest_col, buffers, total_matches, - [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor); - }, - init, from_build, 4); - return; - } - - // VARCHAR - const Column *str_src_ptr = columnar_source; - if (!str_src_ptr) { - if (materialized_source) { - str_src_ptr = &deferred_plan.original_plan - ->inputs[materialized_source->source_table] - .columns[materialized_source->source_column]; - } else if (deferred_source && deferred_plan.original_plan) { - // For deferred VARCHAR, get source from provenance of first row - // All rows in a deferred column share the same base table/column - str_src_ptr = &deferred_plan.original_plan - ->inputs[col_info->provenance.base_table_id] - .columns[col_info->provenance.base_column_idx]; - } - } - - if (!str_src_ptr) { - // Shouldn't happen, but handle gracefully - return; - } - - auto init = [str_src_ptr](std::function alloc) { - return VarcharPageBuilder(*str_src_ptr, std::move(alloc)); - }; - - materialize_deferred_column( - dest_col, buffers, total_matches, - [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor); - }, - init, 
from_build, 35); -} - -/** - * @brief Materialize all output columns from deferred intermediate. - * - * For root join in deferred execution path. Resolves all deferred columns - * by decoding 64-bit provenance to base tables. - * - * @tparam Mode Collection mode for compile-time specialization. - * @param buffers Thread-local match buffers from probe. - * @param build_input Build side deferred input. - * @param probe_input Probe side deferred input. - * @param join_node Deferred join node with column info. - * @param remapped_attrs Output projection after build/probe remapping. - * @param build_size Number of columns from build side. - * @param columnar_reader Reader for columnar data. - * @param deferred_plan Full deferred plan for base table access. - * @return ColumnarTable with final output. - */ -template -inline ColumnarTable materialize_deferred_from_buffers( - std::vector> &buffers, - const DeferredInput &build_input, const DeferredInput &probe_input, - const DeferredJoinNode &join_node, - const std::vector> &remapped_attrs, - size_t build_size, bool build_is_left, ColumnarReader &columnar_reader, - const DeferredPlan &deferred_plan) { - - // Compute total matches - size_t total_matches = 0; - for (const auto &buf : buffers) { - total_matches += buf.count(); - } - - if (total_matches == 0) { - return create_empty_deferred_final(remapped_attrs); - } - - ColumnarTable result; - result.num_rows = total_matches; - - for (size_t out_idx = 0; out_idx < remapped_attrs.size(); ++out_idx) { - auto [col_idx, data_type] = remapped_attrs[out_idx]; - result.columns.emplace_back(data_type); - Column &dest_col = result.columns.back(); - - // Pass out_idx (output position) not col_idx (global column index) - // because materialize_single_deferred_column searches by original_idx - // which is the output position in join_node.columns - materialize_single_deferred_column( - dest_col, out_idx, build_size, build_is_left, buffers, - total_matches, build_input, probe_input, 
join_node, columnar_reader, - deferred_plan); - } - - return result; -} - -} // namespace materialize -} // namespace Contest diff --git a/src/analyze_plan.cpp b/src/analyze_plan.cpp index f511c60..3c7fc83 100644 --- a/src/analyze_plan.cpp +++ b/src/analyze_plan.cpp @@ -6,9 +6,8 @@ * materialized eagerly (join keys needed by parent) vs deferred until final * output. Traces column provenance back to base tables for deferred resolution. * - * @see deferred_plan.h for DeferredPlan structure. + * @see deferred_plan.h for AnalyzedPlan structure. */ -#include #include #include @@ -112,7 +111,7 @@ find_parent_join_key(const Plan &plan, size_t node_idx, * LEFT_ONLY/RIGHT_ONLY. */ join::MatchCollectionMode -compute_base_collection_mode(const std::vector &columns, +compute_base_collection_mode(const std::vector &columns, size_t left_output_size) { bool needs_left = false; bool needs_right = false; @@ -137,11 +136,11 @@ compute_base_collection_mode(const std::vector &columns, } // anonymous namespace -DeferredPlan analyze_plan(const Plan &plan) { - DeferredPlan deferred; - deferred.original_plan = &plan; - deferred.nodes.resize(plan.nodes.size()); - deferred.root = plan.root; +AnalyzedPlan analyze_plan(const Plan &plan) { + AnalyzedPlan analyzed; + analyzed.original_plan = &plan; + analyzed.nodes.resize(plan.nodes.size()); + analyzed.root = plan.root; auto parent_map = build_parent_map(plan); @@ -170,26 +169,26 @@ DeferredPlan analyze_plan(const Plan &plan) { if (const auto *scan = std::get_if(&node.data)) { // Scan node: simple wrapper - DeferredScanNode dscan; - dscan.node_idx = node_idx; - dscan.base_table_id = scan->base_table_id; - dscan.output_attrs = node.output_attrs; - deferred.nodes[node_idx] = std::move(dscan); + AnalyzedScanNode ascan; + ascan.node_idx = node_idx; + ascan.base_table_id = scan->base_table_id; + ascan.output_attrs = node.output_attrs; + analyzed.nodes[node_idx] = std::move(ascan); } else { // Join node: compute materialization decisions const auto 
&join = std::get(node.data); - DeferredJoinNode djoin; - djoin.node_idx = node_idx; - djoin.left_child_idx = join.left; - djoin.right_child_idx = join.right; - djoin.left_join_attr = join.left_attr; - djoin.right_join_attr = join.right_attr; - djoin.output_attrs = node.output_attrs; - djoin.is_root = (node_idx == plan.root); + AnalyzedJoinNode ajoin; + ajoin.node_idx = node_idx; + ajoin.left_child_idx = join.left; + ajoin.right_child_idx = join.right; + ajoin.left_join_attr = join.left_attr; + ajoin.right_join_attr = join.right_attr; + ajoin.output_attrs = node.output_attrs; + ajoin.is_root = (node_idx == plan.root); // Find which column parent needs as join key - djoin.parent_join_key_idx = + ajoin.parent_join_key_idx = find_parent_join_key(plan, node_idx, parent_map); // Get child sizes for determining column source @@ -200,7 +199,7 @@ DeferredPlan analyze_plan(const Plan &plan) { for (size_t i = 0; i < node.output_attrs.size(); ++i) { auto [col_idx, col_type] = node.output_attrs[i]; - DeferredColumnInfo info; + AnalyzedColumnInfo info; info.original_idx = i; info.type = col_type; @@ -220,11 +219,11 @@ DeferredPlan analyze_plan(const Plan &plan) { // Materialization decision: // - At root: ALL columns must be materialized (final output) // - At intermediate: only parent's join key is materialized - if (djoin.is_root) { + if (ajoin.is_root) { // Root node: materialize everything info.resolution = ColumnResolution::MATERIALIZE; - } else if (djoin.parent_join_key_idx.has_value() && - i == *djoin.parent_join_key_idx) { + } else if (ajoin.parent_join_key_idx.has_value() && + i == *ajoin.parent_join_key_idx) { info.resolution = ColumnResolution::MATERIALIZE; } else { info.resolution = ColumnResolution::DEFER; @@ -233,22 +232,22 @@ DeferredPlan analyze_plan(const Plan &plan) { // Trace provenance to base table info.provenance = trace_provenance(plan, node_idx, i); - djoin.columns.push_back(std::move(info)); + ajoin.columns.push_back(std::move(info)); } // Compute 
collection mode and count deferred columns - djoin.base_collection_mode = - compute_base_collection_mode(djoin.columns, left_size); + ajoin.base_collection_mode = + compute_base_collection_mode(ajoin.columns, left_size); // Count deferred columns for pre-allocation - djoin.num_deferred_columns = 0; - for (const auto &col : djoin.columns) { + ajoin.num_deferred_columns = 0; + for (const auto &col : ajoin.columns) { if (col.resolution == ColumnResolution::DEFER) { - ++djoin.num_deferred_columns; + ++ajoin.num_deferred_columns; } } - deferred.nodes[node_idx] = std::move(djoin); + analyzed.nodes[node_idx] = std::move(ajoin); } } @@ -256,28 +255,28 @@ DeferredPlan analyze_plan(const Plan &plan) { // Process in reverse post-order (parents before children) for (auto it = post_order.rbegin(); it != post_order.rend(); ++it) { size_t node_idx = *it; - auto *djoin = std::get_if(&deferred.nodes[node_idx]); - if (!djoin) + auto *ajoin = std::get_if(&analyzed.nodes[node_idx]); + if (!ajoin) continue; // For each column that must be MATERIALIZE, ensure the child also // materializes it - for (const auto &col : djoin->columns) { + for (const auto &col : ajoin->columns) { if (col.resolution != ColumnResolution::MATERIALIZE) continue; // Find which child this column comes from size_t child_idx = - col.from_left ? djoin->left_child_idx : djoin->right_child_idx; + col.from_left ? 
ajoin->left_child_idx : ajoin->right_child_idx; - auto *child_djoin = - std::get_if(&deferred.nodes[child_idx]); - if (!child_djoin) + auto *child_ajoin = + std::get_if(&analyzed.nodes[child_idx]); + if (!child_ajoin) continue; // Child is a scan - always has data // Mark child's column as MATERIALIZE - if (col.child_output_idx < child_djoin->columns.size()) { - child_djoin->columns[col.child_output_idx].resolution = + if (col.child_output_idx < child_ajoin->columns.size()) { + child_ajoin->columns[col.child_output_idx].resolution = ColumnResolution::MATERIALIZE; } } @@ -285,19 +284,19 @@ DeferredPlan analyze_plan(const Plan &plan) { // PASS 3: Recount num_deferred_columns after propagation for (size_t node_idx : post_order) { - auto *djoin = std::get_if(&deferred.nodes[node_idx]); - if (!djoin) + auto *ajoin = std::get_if(&analyzed.nodes[node_idx]); + if (!ajoin) continue; - djoin->num_deferred_columns = 0; - for (const auto &col : djoin->columns) { + ajoin->num_deferred_columns = 0; + for (const auto &col : ajoin->columns) { if (col.resolution == ColumnResolution::DEFER) { - ++djoin->num_deferred_columns; + ++ajoin->num_deferred_columns; } } } - return deferred; + return analyzed; } } // namespace Contest diff --git a/src/execute.cpp b/src/execute.cpp index b9d45e5..29a485c 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -5,12 +5,12 @@ * Traverses plan tree: resolve inputs -> select build/probe -> algorithm * selection -> match collection -> output construction. * - * Flow: execute() -> execute_impl() recursively -> resolve_join_input() for - * ScanNode (ColumnarTable*) or JoinNode (ExecuteResult). Root produces - * ColumnarTable; non-root produces ExecuteResult. + * Flow: execute() -> execute_impl() recursively -> resolve_input() for + * ScanNode (ColumnarTable*) or JoinNode (IntermediateResult). Root produces + * ColumnarTable; non-root produces IntermediateResult. 
* - * Lifetimes: base tables live for query duration; ExecuteResult held on stack - * until parent completes; VARCHAR refs valid via base table lifetime. + * Lifetimes: base tables live for query duration; IntermediateResult held on + * stack until parent completes; VARCHAR refs valid via base table lifetime. * * Row order non-deterministic (work-stealing); semantically correct per SQL. * @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -40,309 +41,54 @@ #include #include -#ifdef USE_DEFERRED_MATERIALIZATION -#include -#include -#include -#include -#endif - namespace Contest { using namespace join; using materialize::construct_intermediate_from_buffers; -using materialize::create_empty_result; +using materialize::create_empty_intermediate_result; using materialize::materialize_from_buffers; /** - * @brief Result variant: ExtendedResult (intermediate, with row ID tracking) or - * ColumnarTable (final output per contest API). + * @brief Result variant: IntermediateResult (non-root) or ColumnarTable (root). */ -using JoinResult = std::variant; +using JoinResult = std::variant; -/** - * @brief Recursive join execution with timing. - * @param plan Query plan with nodes and base tables. - * @param node_idx Current node index in plan.nodes. - * @param is_root True -> ColumnarTable output; false -> ExecuteResult. - * @param stats Timing accumulator. - * @return JoinResult (intermediate or final). - */ -JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, +// Forward declaration +JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, TimingStats &stats); /** * @brief Resolve plan node to JoinInput. * * ScanNode -> non-owning ColumnarTable*; JoinNode -> recursive execution - * returning owned ExtendedResult. Implements depth-first traversal. - * - * @param plan Query plan. - * @param node_idx Node index to resolve. - * @param stats Timing accumulator. 
- * @return JoinInput with data variant and metadata. + * returning owned IntermediateResult. */ -JoinInput resolve_join_input(const Plan &plan, size_t node_idx, - TimingStats &stats) { +JoinInput resolve_input(const AnalyzedPlan &plan, size_t node_idx, + TimingStats &stats) { JoinInput input; - const auto &node = plan.nodes[node_idx]; - input.node = &node; + const auto &anode = plan[node_idx]; + const auto &pnode = plan.original_plan->nodes[node_idx]; + input.node = &pnode; + input.analyzed_node = &anode; - if (const auto *scan = std::get_if(&node.data)) { - input.data = &plan.inputs[scan->base_table_id]; + if (const auto *scan = std::get_if(&anode)) { + input.data = &plan.original_plan->inputs[scan->base_table_id]; input.table_id = scan->base_table_id; } else { auto result = execute_impl(plan, node_idx, false, stats); - input.data = std::get(std::move(result)); + input.data = std::get(std::move(result)); input.table_id = 0; } return input; } /** - * @brief Unified probe + materialize helper templated on collection mode. - * - * Executes probe (nested loop or hash join) and materialization/intermediate - * construction in a single function. Template parameter eliminates runtime - * branching in hot loops. - * - * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY). 
- */ -template -JoinResult execute_join_with_mode( - bool use_nested_loop, bool probe_is_columnar, bool is_root, - const UnchainedHashtable *hash_table, const JoinInput &build_input, - const JoinInput &probe_input, const BuildProbeConfig &config, - const PlanNode &build_node, const PlanNode &probe_node, JoinSetup &setup, - io::ColumnarReader &columnar_reader, const Plan &plan, TimingStats &stats) { - - std::vector> match_buffers; - - if (use_nested_loop) { - auto nested_loop_start = std::chrono::high_resolution_clock::now(); - match_buffers = nested_loop_join( - build_input, probe_input, config.build_attr, config.probe_attr); - auto nested_loop_end = std::chrono::high_resolution_clock::now(); - stats.nested_loop_join_ms += - std::chrono::duration_cast( - nested_loop_end - nested_loop_start) - .count(); - } else { - auto probe_start = std::chrono::high_resolution_clock::now(); - if (probe_is_columnar) { - match_buffers = probe_columnar(*hash_table, probe_input, - config.probe_attr); - } else { - const auto &probe_result = - std::get(probe_input.data); - match_buffers = probe_intermediate( - *hash_table, probe_result.columns[config.probe_attr]); - } - auto probe_end = std::chrono::high_resolution_clock::now(); - stats.hash_join_probe_ms += - std::chrono::duration_cast(probe_end - - probe_start) - .count(); - } - - size_t total_matches = 0; - for (const auto &buf : match_buffers) { - total_matches += buf.count(); - } - - if (is_root) { - auto mat_start = std::chrono::high_resolution_clock::now(); - JoinResult final_result; - if (total_matches == 0) { - final_result = create_empty_result(config.remapped_attrs); - } else { - prepare_output_columns( - columnar_reader, build_input, probe_input, build_node, - probe_node, config.remapped_attrs, build_input.output_size()); - - final_result = materialize_from_buffers( - match_buffers, build_input, probe_input, config.remapped_attrs, - build_node, probe_node, build_input.output_size(), - columnar_reader, plan); - } - auto 
mat_end = std::chrono::high_resolution_clock::now(); - stats.materialize_ms += - std::chrono::duration_cast(mat_end - - mat_start) - .count(); - return final_result; - } else { - auto inter_start = std::chrono::high_resolution_clock::now(); - if (total_matches > 0) { - prepare_output_columns( - columnar_reader, build_input, probe_input, build_node, - probe_node, config.remapped_attrs, build_input.output_size()); - - construct_intermediate_from_buffers( - match_buffers, build_input, probe_input, config.remapped_attrs, - build_node, probe_node, build_input.output_size(), - columnar_reader, setup.results, setup.merged_table_ids); - } - auto inter_end = std::chrono::high_resolution_clock::now(); - stats.intermediate_ms += - std::chrono::duration_cast(inter_end - - inter_start) - .count(); - return std::move(setup.results); - } -} - -/** - * @brief Core recursive join execution. - * - * Phases: resolve L/R inputs -> select build/probe (smaller=build) -> algorithm - * choice -> build/probe -> output construction. - * - * Algorithm: nested loop if build_rows < HASH_TABLE_THRESHOLD (8); else radix- - * partitioned hash join. - * - * Memory: hash table and MatchCollector local (freed on return); child - * ExecuteResults on stack until materialization; setup.results pre-allocated. + * @brief Select build/probe sides for join input. */ -JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, - TimingStats &stats) { - auto &node = plan.nodes[node_idx]; - - if (!std::holds_alternative(node.data)) { - return ExtendedResult{}; - } - - const auto &join = std::get(node.data); - const auto &output_attrs = node.output_attrs; - const auto &left_node = plan.nodes[join.left]; - const auto &right_node = plan.nodes[join.right]; - - JoinInput left_input = resolve_join_input(plan, join.left, stats); - JoinInput right_input = resolve_join_input(plan, join.right, stats); - - /* Build/probe selection: smaller input = build side; remaps output_attrs. 
- */ - auto setup_start = std::chrono::high_resolution_clock::now(); - auto config = - select_build_probe_side(join, left_input, right_input, output_attrs); - const JoinInput &build_input = config.build_left ? left_input : right_input; - const JoinInput &probe_input = config.build_left ? right_input : left_input; - const auto &build_node = config.build_left ? left_node : right_node; - const auto &probe_node = config.build_left ? right_node : left_node; - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - /* Nested loop for <8 rows (L1-resident, no hash overhead, SIMD). */ - const size_t HASH_TABLE_THRESHOLD = 8; - size_t build_rows = build_input.row_count(config.build_attr); - bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD); - - /* Pre-allocate ExecuteResult; ColumnarReader PageIndex built lazily. */ - JoinSetup setup = setup_join(build_input, probe_input, build_node, - probe_node, left_node, right_node, left_input, - right_input, output_attrs, build_rows); - auto setup_end = std::chrono::high_resolution_clock::now(); - auto setup_elapsed = std::chrono::duration_cast( - setup_end - setup_start); - stats.setup_ms += setup_elapsed.count(); - - /* Skip unused-side row IDs if output needs only one side (50% savings). */ - MatchCollectionMode collection_mode = determine_collection_mode( - config.remapped_attrs, config.build_left ? left_input.output_size() - : right_input.output_size()); - - /* Build hash table if needed (before mode dispatch). */ - std::optional hash_table; - if (!use_nested_loop) { - auto build_start = std::chrono::high_resolution_clock::now(); - hash_table = - build_is_columnar - ? 
build_from_columnar(build_input, config.build_attr) - : build_from_intermediate(build_input, config.build_attr); - auto build_end = std::chrono::high_resolution_clock::now(); - stats.hashtable_build_ms += - std::chrono::duration_cast(build_end - - build_start) - .count(); - } - - /* Dispatch based on collection mode - single runtime branch, then - * fully specialized template instantiation with zero branching in hot - * loops. */ - switch (collection_mode) { - case MatchCollectionMode::BOTH: - return execute_join_with_mode( - use_nested_loop, probe_is_columnar, is_root, - use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); - - case MatchCollectionMode::LEFT_ONLY: - return execute_join_with_mode( - use_nested_loop, probe_is_columnar, is_root, - use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); - - case MatchCollectionMode::RIGHT_ONLY: - return execute_join_with_mode( - use_nested_loop, probe_is_columnar, is_root, - use_nested_loop ? 
nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); - } - - // Should never reach here, but satisfy compiler - return ExtendedResult{}; -} - -#ifdef USE_DEFERRED_MATERIALIZATION -// ============================================================================ -// DEFERRED MATERIALIZATION PATH -// ============================================================================ - -using DeferredJoinResult = std::variant; - -using materialize::construct_deferred_from_buffers; -using materialize::create_empty_deferred_result; -using materialize::materialize_deferred_from_buffers; - -// Forward declaration -DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, - size_t node_idx, bool is_root, - TimingStats &stats); - -/** - * @brief Resolve deferred plan node to DeferredInput. - */ -DeferredInput resolve_deferred_input(const DeferredPlan &deferred_plan, - size_t node_idx, TimingStats &stats) { - DeferredInput input; - const auto &dnode = deferred_plan[node_idx]; - const auto &pnode = deferred_plan.original_plan->nodes[node_idx]; - input.node = &pnode; - input.deferred_node = &dnode; - - if (const auto *dscan = std::get_if(&dnode)) { - input.data = &deferred_plan.original_plan->inputs[dscan->base_table_id]; - input.table_id = dscan->base_table_id; - } else { - auto result = - execute_deferred_impl(deferred_plan, node_idx, false, stats); - input.data = std::get(std::move(result)); - input.table_id = 0; - } - return input; -} - -/** - * @brief Select build/probe sides for deferred input. 
- */ -BuildProbeConfig select_deferred_build_probe_side( - const JoinNode &join, const DeferredInput &left_input, - const DeferredInput &right_input, +BuildProbeConfig select_join_build_probe_side( + const JoinNode &join, const JoinInput &left_input, + const JoinInput &right_input, const std::vector> &output_attrs) { BuildProbeConfig config; @@ -371,21 +117,21 @@ BuildProbeConfig select_deferred_build_probe_side( } /** - * @brief Unified probe + materialize for deferred path. + * @brief Unified probe + materialize helper templated on collection mode. */ template -DeferredJoinResult execute_deferred_join_with_mode( +JoinResult execute_join_with_mode( bool use_nested_loop, bool probe_is_columnar, bool is_root, - const UnchainedHashtable *hash_table, const DeferredInput &build_input, - const DeferredInput &probe_input, const BuildProbeConfig &config, - const DeferredJoinNode &join_node, io::ColumnarReader &columnar_reader, - const DeferredPlan &deferred_plan, TimingStats &stats) { + const UnchainedHashtable *hash_table, const JoinInput &build_input, + const JoinInput &probe_input, const BuildProbeConfig &config, + const AnalyzedJoinNode &join_node, io::ColumnarReader &columnar_reader, + const AnalyzedPlan &plan, TimingStats &stats) { std::vector> match_buffers; if (use_nested_loop) { auto nested_loop_start = std::chrono::high_resolution_clock::now(); - match_buffers = nested_loop_join_deferred( + match_buffers = nested_loop_join( build_input, probe_input, config.build_attr, config.probe_attr); auto nested_loop_end = std::chrono::high_resolution_clock::now(); stats.nested_loop_join_ms += @@ -395,16 +141,11 @@ DeferredJoinResult execute_deferred_join_with_mode( } else { auto probe_start = std::chrono::high_resolution_clock::now(); if (probe_is_columnar) { - // Create JoinInput for columnar probe - JoinInput probe_ji; - probe_ji.node = probe_input.node; - probe_ji.data = std::get(probe_input.data); - probe_ji.table_id = probe_input.table_id; - match_buffers = - 
probe_columnar(*hash_table, probe_ji, config.probe_attr); + match_buffers = probe_columnar(*hash_table, probe_input, + config.probe_attr); } else { const auto &probe_result = - std::get(probe_input.data); + std::get(probe_input.data); // Probe using materialized column (should be the join key) const auto *mat_col = probe_result.get_materialized(config.probe_attr); @@ -433,21 +174,21 @@ DeferredJoinResult execute_deferred_join_with_mode( if (is_root) { auto mat_start = std::chrono::high_resolution_clock::now(); - DeferredJoinResult final_result; + JoinResult final_result; if (total_matches == 0) { final_result = - materialize::create_empty_deferred_final(config.remapped_attrs); + materialize::create_empty_result(config.remapped_attrs); } else { // Prepare page indices for final materialization - materialize::prepare_final_deferred_columns( + materialize::prepare_final_columns( columnar_reader, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), config.build_left); - final_result = materialize_deferred_from_buffers( + final_result = materialize_from_buffers( match_buffers, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, deferred_plan); + config.build_left, columnar_reader, plan); } auto mat_end = std::chrono::high_resolution_clock::now(); stats.materialize_ms += @@ -457,20 +198,20 @@ DeferredJoinResult execute_deferred_join_with_mode( return final_result; } else { auto inter_start = std::chrono::high_resolution_clock::now(); - DeferredResult result; + IntermediateResult result; if (total_matches > 0) { // Prepare page indices for intermediate construction - materialize::prepare_deferred_columns( + materialize::prepare_intermediate_columns( columnar_reader, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), config.build_left); - construct_deferred_from_buffers( + construct_intermediate_from_buffers( match_buffers, 
build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, result, deferred_plan); + config.build_left, columnar_reader, result, plan); } else { - result = create_empty_deferred_result(join_node); + result = create_empty_intermediate_result(join_node); } auto inter_end = std::chrono::high_resolution_clock::now(); stats.intermediate_ms += @@ -482,44 +223,37 @@ DeferredJoinResult execute_deferred_join_with_mode( } /** - * @brief Recursive deferred join execution. + * @brief Recursive join execution. */ -DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, - size_t node_idx, bool is_root, - TimingStats &stats) { - const auto &dnode = deferred_plan[node_idx]; +JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, + TimingStats &stats) { + const auto &anode = plan[node_idx]; - if (std::holds_alternative(dnode)) { - return DeferredResult{}; + if (std::holds_alternative(anode)) { + return IntermediateResult{}; } - const auto &djoin = std::get(dnode); - const auto &plan = *deferred_plan.original_plan; - const auto &pnode = plan.nodes[node_idx]; + const auto &ajoin = std::get(anode); + const auto &original_plan = *plan.original_plan; + const auto &pnode = original_plan.nodes[node_idx]; const auto &join = std::get(pnode.data); // Resolve inputs - DeferredInput left_input = - resolve_deferred_input(deferred_plan, djoin.left_child_idx, stats); - DeferredInput right_input = - resolve_deferred_input(deferred_plan, djoin.right_child_idx, stats); + JoinInput left_input = resolve_input(plan, ajoin.left_child_idx, stats); + JoinInput right_input = resolve_input(plan, ajoin.right_child_idx, stats); // Build/probe selection auto setup_start = std::chrono::high_resolution_clock::now(); - auto config = select_deferred_build_probe_side( - join, left_input, right_input, djoin.output_attrs); - const DeferredInput &build_input = - config.build_left ? 
left_input : right_input; - const DeferredInput &probe_input = - config.build_left ? right_input : left_input; + auto config = select_join_build_probe_side(join, left_input, right_input, + ajoin.output_attrs); + const JoinInput &build_input = config.build_left ? left_input : right_input; + const JoinInput &probe_input = config.build_left ? right_input : left_input; bool build_is_columnar = build_input.is_columnar(); bool probe_is_columnar = probe_input.is_columnar(); const size_t HASH_TABLE_THRESHOLD = 8; size_t build_rows = build_input.row_count(config.build_attr); - // Use nested loop for small build tables - works with both columnar and - // DeferredResult inputs (join keys are always materialized). bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD); io::ColumnarReader columnar_reader; @@ -530,7 +264,7 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, // Use pre-computed collection mode from plan analysis. // base_collection_mode assumes build=left; flip if build=right at runtime. 
- MatchCollectionMode mode = djoin.base_collection_mode; + MatchCollectionMode mode = ajoin.base_collection_mode; if (!config.build_left) { if (mode == MatchCollectionMode::LEFT_ONLY) mode = MatchCollectionMode::RIGHT_ONLY; @@ -543,21 +277,16 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, if (!use_nested_loop) { auto build_start = std::chrono::high_resolution_clock::now(); if (build_is_columnar) { - JoinInput build_ji; - build_ji.node = build_input.node; - build_ji.data = std::get(build_input.data); - build_ji.table_id = build_input.table_id; - hash_table = build_from_columnar(build_ji, config.build_attr); + hash_table = build_from_columnar(build_input, config.build_attr); } else { - const auto &dr = std::get(build_input.data); - const auto *mat_col = dr.get_materialized(config.build_attr); + const auto &ir = std::get(build_input.data); + const auto *mat_col = ir.get_materialized(config.build_attr); if (!mat_col) { std::fprintf( stderr, "ERROR: build join key not materialized! build_attr=%zu " "mat_map_size=%zu num_rows=%zu\n", - config.build_attr, dr.materialized_map.size(), dr.num_rows); - // Fatal - this should never happen + config.build_attr, ir.materialized_map.size(), ir.num_rows); std::abort(); } hash_table.emplace(mat_col->row_count()); @@ -573,29 +302,27 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, // Dispatch based on collection mode switch (mode) { case MatchCollectionMode::BOTH: - return execute_deferred_join_with_mode( + return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); case MatchCollectionMode::LEFT_ONLY: - return execute_deferred_join_with_mode( + return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? 
nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); case MatchCollectionMode::RIGHT_ONLY: - return execute_deferred_join_with_mode( + return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); } - return DeferredResult{}; + return IntermediateResult{}; } -#endif // USE_DEFERRED_MATERIALIZATION - /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. @@ -613,26 +340,17 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, TimingStats stats; -#ifdef USE_DEFERRED_MATERIALIZATION - // Deferred materialization path: analyze plan, then execute with deferred - // intermediate construction + // Analyze plan and execute with deferred intermediate construction auto analyze_start = std::chrono::high_resolution_clock::now(); - DeferredPlan deferred_plan = analyze_plan(plan); + AnalyzedPlan analyzed_plan = analyze_plan(plan); auto analyze_end = std::chrono::high_resolution_clock::now(); stats.analyze_plan_ms = std::chrono::duration_cast(analyze_end - analyze_start) .count(); - auto deferred_result = - execute_deferred_impl(deferred_plan, plan.root, true, stats); - ColumnarTable final_result = - std::get(std::move(deferred_result)); -#else - // Eager materialization path (original) - auto result = execute_impl(plan, plan.root, true, stats); + auto result = execute_impl(analyzed_plan, plan.root, true, stats); ColumnarTable final_result = std::get(std::move(result)); -#endif auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast( @@ -640,19 +358,13 @@ ColumnarTable execute(const Plan &plan, void *context, 
TimingStats *stats_out, stats.total_execution_ms = total_elapsed.count(); if (show_detailed_timing) { - int64_t accounted = stats.hashtable_build_ms + - stats.hash_join_probe_ms + - stats.nested_loop_join_ms + stats.materialize_ms + - stats.setup_ms + stats.intermediate_ms; -#ifdef USE_DEFERRED_MATERIALIZATION - accounted += stats.analyze_plan_ms; -#endif + int64_t accounted = + stats.hashtable_build_ms + stats.hash_join_probe_ms + + stats.nested_loop_join_ms + stats.materialize_ms + stats.setup_ms + + stats.intermediate_ms + stats.analyze_plan_ms; int64_t other = stats.total_execution_ms - accounted; -#ifdef USE_DEFERRED_MATERIALIZATION - std::cout << "[DEFERRED] Plan Analysis Time: " << stats.analyze_plan_ms - << " ms\n"; -#endif + std::cout << "Plan Analysis Time: " << stats.analyze_plan_ms << " ms\n"; std::cout << "Hashtable Build Time: " << stats.hashtable_build_ms << " ms\n"; std::cout << "Hash Join Probe Time: " << stats.hash_join_probe_ms From d2bbe70380c222e5c9fa85bcdf1ca23f082acfea Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 14:18:05 +0200 Subject: [PATCH 05/13] feat: deferred tables --- include/data_model/intermediate.h | 188 +++++---- .../materialization/construct_intermediate.h | 374 ++++++++---------- include/materialization/materialize.h | 50 +-- 3 files changed, 302 insertions(+), 310 deletions(-) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 4a29919..6a697fa 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -5,7 +5,7 @@ * Provides: * - mema::value_t: 4-byte value encoding (INT32 direct, VARCHAR as page/offset) * - mema::column_t: 16KB-paged column for materialized values - * - mema::deferred_column_t: 32KB-paged column for 64-bit provenance encoding + * - mema::DeferredTable: 16KB-paged 32-bit row ID storage per base table * - IntermediateResult: Lightweight result with selective materialization * - JoinInput: Unified abstraction over 
columnar tables and intermediate * results @@ -29,10 +29,11 @@ /** * @namespace mema - * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages). + * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages) + + * DeferredTable (32-bit row IDs). * * value_t: INT32 direct or VARCHAR page/offset ref. column_t: arena-allocated - * pages with write_at(). + * pages with write_at(). DeferredTable: 32-bit row ID storage per base table. * * @see Contest::IntermediateResult, plan.h ColumnarTable. */ @@ -176,84 +177,85 @@ struct column_t { using Columnar = std::vector; /** - * @brief 64-bit provenance column for deferred materialization. + * @brief Per-base-table deferred row ID storage with multi-column tracking. * - * Stores encoded (table_id, column_idx, row_id) for each row using - * DeferredProvenance encoding. Uses 32KB pages with 4096 entries each. + * Stores 32-bit row IDs for a single base table. All columns from this + * base table share the same row ID lookup, reducing memory from 8 bytes + * per column to 4 bytes per table. * - * @see DeferredProvenance for encoding scheme. - * @see IntermediateResult for usage. + * Uses 16KB pages (reuses IR_PAGE arena chunk) with 4096 uint32_t entries. 
*/ -struct deferred_column_t { - static constexpr size_t PAGE_SIZE = 1 << 15; // 32KB +struct DeferredTable { + static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB static constexpr size_t ENTRIES_PER_PAGE = - PAGE_SIZE / sizeof(uint64_t); // 4096 + PAGE_SIZE / sizeof(uint32_t); // 4096 static constexpr size_t ENTRY_SHIFT = 12; // log2(4096) static constexpr size_t ENTRY_MASK = ENTRIES_PER_PAGE - 1; struct alignas(PAGE_SIZE) Page { - uint64_t data[ENTRIES_PER_PAGE]; + uint32_t data[ENTRIES_PER_PAGE]; }; std::vector pages; size_t num_values = 0; - deferred_column_t() = default; + /// Base table ID this deferred table references + uint8_t base_table_id = 0; - deferred_column_t(deferred_column_t &&other) noexcept - : pages(std::move(other.pages)), num_values(other.num_values) { + /// True if this deferred table comes from build side (vs probe) + bool from_build = false; + + /// Column indices from this base table that need deferred resolution + std::vector column_indices; + + DeferredTable() = default; + + DeferredTable(DeferredTable &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values), + base_table_id(other.base_table_id), from_build(other.from_build), + column_indices(std::move(other.column_indices)) { other.pages.clear(); other.num_values = 0; } - deferred_column_t &operator=(deferred_column_t &&other) noexcept { + DeferredTable &operator=(DeferredTable &&other) noexcept { if (this != &other) { pages = std::move(other.pages); num_values = other.num_values; + base_table_id = other.base_table_id; + from_build = other.from_build; + column_indices = std::move(other.column_indices); other.pages.clear(); other.num_values = 0; } return *this; } - deferred_column_t(const deferred_column_t &) = delete; - deferred_column_t &operator=(const deferred_column_t &) = delete; + DeferredTable(const DeferredTable &) = delete; + DeferredTable &operator=(const DeferredTable &) = delete; - ~deferred_column_t() = default; + ~DeferredTable() = default; - /** 
@brief O(1) read: idx>>12 for page, idx&0xFFF for offset. */ - inline uint64_t operator[](size_t idx) const { + /// O(1) read: idx >> 12 for page, idx & 0xFFF for offset + inline uint32_t operator[](size_t idx) const { return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK]; } - /** @brief Thread-safe write at idx (requires pages to be set up first). */ - inline void write_at(size_t idx, uint64_t val) { - pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = val; + /// Thread-safe write at idx (requires pages set up first) + inline void write_at(size_t idx, uint32_t row_id) { + pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = row_id; } - /** @brief Total value count. */ size_t row_count() const { return num_values; } + void set_row_count(size_t count) { num_values = count; } - /** @brief Set row count without allocation (for assembly pattern). */ - inline void set_row_count(size_t count) { num_values = count; } - - /** @brief Pre-allocate pages from arena. */ - inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, - size_t count) { - static_assert( - sizeof(Page) == - Contest::platform::ChunkSize< - Contest::platform::ChunkType::DEFERRED_PAGE>::value, - "Page size mismatch with DEFERRED_PAGE chunk size"); - size_t pages_needed = (count + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE; - pages.reserve(pages_needed); - for (size_t i = 0; i < pages_needed; ++i) { - void *ptr = - arena - .alloc_chunk(); - pages.push_back(reinterpret_cast(ptr)); + /// Check if this table tracks a specific base column + bool has_column(uint8_t col_idx) const { + for (uint8_t c : column_indices) { + if (c == col_idx) + return true; } - num_values = count; + return false; } }; @@ -261,19 +263,28 @@ struct deferred_column_t { namespace Contest { +/** + * @brief Reference from a column to its deferred table. 
+ */ +struct DeferredColumnRef { + uint8_t table_idx; ///< Index into IntermediateResult::deferred_tables + uint8_t base_col; ///< Base column index in Plan::inputs[base_table_id] +}; + /** * @brief Lightweight intermediate result with selective materialization. * * Stores only columns marked MATERIALIZE (typically just the parent's join - * key). All other columns are resolved at final materialization using - * per-column 64-bit provenance (table_id, column_idx, row_id). + * key). Deferred columns use per-table 32-bit row ID storage instead of + * per-column 64-bit provenance, achieving up to 10x memory reduction for + * multi-column deferred scenarios. * - * Memory savings: For a join projecting N columns where only 1 is a join key, - * IntermediateResult uses ~1/N the memory for data columns. Additionally, we - * only track provenance for deferred columns (not all tables). + * Memory savings example: For 5 columns from same base table: + * - Old: 5 columns Ɨ 8 bytes = 40 bytes per row + * - New: 1 DeferredTable Ɨ 4 bytes = 4 bytes per row * * @see AnalyzedColumnInfo for materialization decisions. - * @see DeferredProvenance for 64-bit encoding scheme. + * @see DeferredTable for 32-bit row ID storage. */ struct IntermediateResult { /// Only columns marked MATERIALIZE (typically 1 join key). @@ -283,13 +294,15 @@ struct IntermediateResult { /// deferred). std::vector> materialized_map; - /// Per-deferred-column provenance (64-bit encoded table_id+column_idx+row). - /// One deferred_column_t per DEFER column, stores full provenance per row. - std::vector deferred_columns; + /// Per-base-table deferred row ID storage. One DeferredTable per unique + /// (from_build, base_table_id) pair. All columns from same base table share + /// the same row ID lookup. + std::vector deferred_tables; - /// Map: original column index -> index in deferred_columns (nullopt if - /// materialized). 
- std::vector> deferred_map; + /// Map: original column index -> DeferredColumnRef (nullopt if + /// materialized). The ref contains table_idx (into deferred_tables) and + /// base_col for resolution. + std::vector> deferred_map; /// Reference to node info for column provenance resolution. const AnalyzedJoinNode *node_info = nullptr; @@ -325,22 +338,36 @@ struct IntermediateResult { return &materialized[*materialized_map[orig_idx]]; } - /** @brief Get deferred column provenance, or nullptr if materialized. */ - const mema::deferred_column_t *get_deferred(size_t orig_idx) const { + /** @brief Get deferred table for a column, or nullptr if materialized. */ + const mema::DeferredTable *get_deferred_table(size_t orig_idx) const { if (!is_deferred(orig_idx)) return nullptr; - return &deferred_columns[*deferred_map[orig_idx]]; + return &deferred_tables[deferred_map[orig_idx]->table_idx]; } - /** @brief Get mutable deferred column provenance, or nullptr. */ - mema::deferred_column_t *get_deferred_mut(size_t orig_idx) { + /** @brief Get mutable deferred table for a column, or nullptr. */ + mema::DeferredTable *get_deferred_table_mut(size_t orig_idx) { if (!is_deferred(orig_idx)) return nullptr; - return &deferred_columns[*deferred_map[orig_idx]]; + return &deferred_tables[deferred_map[orig_idx]->table_idx]; } - /** @brief Number of deferred columns. */ - size_t num_deferred() const { return deferred_columns.size(); } + /** @brief Get base column index for deferred column. */ + uint8_t get_deferred_base_col(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return 0; + return deferred_map[orig_idx]->base_col; + } + + /** @brief Get full DeferredColumnRef for a column, or nullptr. */ + const DeferredColumnRef *get_deferred_ref(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return nullptr; + return &(*deferred_map[orig_idx]); + } + + /** @brief Number of deferred tables (unique base tables). 
*/ + size_t num_deferred_tables() const { return deferred_tables.size(); } }; /** @@ -397,15 +424,40 @@ struct JoinInput { } /** - * @brief Get deferred column provenance for a column index. + * @brief Get deferred table for a column index. + * + * For columnar inputs, returns nullptr (caller must encode fresh). + * For IntermediateResult inputs, returns existing deferred table. + */ + const mema::DeferredTable *get_deferred_table(size_t col_idx) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_deferred_table(col_idx); + } + + /** + * @brief Get base column index for a deferred column. + * + * For columnar inputs, returns 0 (caller must use column metadata). + * For IntermediateResult inputs, returns stored base column index. + */ + uint8_t get_deferred_base_col(size_t col_idx) const { + if (is_columnar()) + return 0; + return std::get(data).get_deferred_base_col( + col_idx); + } + + /** + * @brief Get full DeferredColumnRef for a column index. * * For columnar inputs, returns nullptr (caller must encode fresh). - * For IntermediateResult inputs, returns existing provenance column. + * For IntermediateResult inputs, returns pointer to DeferredColumnRef. */ - const mema::deferred_column_t *get_deferred_column(size_t col_idx) const { + const DeferredColumnRef *get_deferred_ref(size_t col_idx) const { if (is_columnar()) return nullptr; - return std::get(data).get_deferred(col_idx); + return std::get(data).get_deferred_ref(col_idx); } }; diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index ed1834d..99c7d69 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -3,13 +3,13 @@ * @brief Constructs intermediate results for multi-way joins. * * Allocates and populates IntermediateResult with only MATERIALIZE columns - * (typically just the parent's join key). 
Deferred columns store 64-bit - * provenance (table_id, column_idx, row_id) for resolution at final output. + * (typically just the parent's join key). Deferred columns use per-table + * 32-bit row ID storage for memory efficiency. * * Optimized with: * - Column-major iteration for cache locality * - Precomputed source metadata to avoid per-row variant access - * - SIMD provenance encoding (AVX2/NEON) for deferred columns + * - Per-table 32-bit row ID storage (vs per-column 64-bit provenance) * - Batch access to match collector chunks * * @see materialize.h for final resolution of deferred columns. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -27,12 +28,6 @@ #include #include -#if defined(__x86_64__) -#include -#elif defined(__aarch64__) -#include -#endif - namespace Contest { namespace materialize { @@ -43,135 +38,60 @@ using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; // ============================================================================ -// SIMD Provenance Encoding +// Row ID Batch Operations (for 32-bit per-table deferred) // ============================================================================ -namespace simd_provenance { - -#if defined(__x86_64__) && defined(__AVX2__) -inline constexpr size_t BATCH_SIZE = 4; ///< 4 x uint64_t in AVX2 (256-bit) -#elif defined(__aarch64__) -inline constexpr size_t BATCH_SIZE = 2; ///< 2 x uint64_t in NEON (128-bit) -#else -inline constexpr size_t BATCH_SIZE = 0; ///< No SIMD available -#endif +namespace row_id_ops { /** - * @brief Encode provenance for batch of row IDs using SIMD. - * - * Encodes (table_id << 56) | (column_idx << 48) | row_id for each row. - * Uses AVX2 on x86_64 or NEON on aarch64, with scalar fallback. + * @brief Write row IDs directly from columnar input. 
* - * @param dest Destination deferred column - * @param start_idx Starting output index - * @param row_ids Pointer to row IDs (from IndexChunk, contiguous) - * @param count Number of row IDs to process - * @param table_id Base table ID (constant for all rows) - * @param column_idx Base column index (constant for all rows) - * @return Number of rows processed (always == count) + * For columnar inputs, we just write the row_id directly (it's already + * the base table row ID). */ -inline size_t encode_provenance_batch(mema::deferred_column_t &dest, - size_t start_idx, const uint32_t *row_ids, - size_t count, uint8_t table_id, - uint8_t column_idx) { - // Precompute constant prefix: (table_id << 56) | (column_idx << 48) - const uint64_t prefix = DeferredProvenance::encode(table_id, column_idx, 0); - - size_t i = 0; - -#if defined(__x86_64__) && defined(__AVX2__) - // AVX2: Process 4 x uint64_t at a time - // Load 4 x uint32_t, zero-extend to 4 x uint64_t, OR with prefix - const __m256i prefix_vec = _mm256_set1_epi64x(static_cast(prefix)); - - for (; i + 4 <= count; i += 4) { - // Load 4 x uint32_t and zero-extend to 4 x uint64_t - __m128i rows_32 = - _mm_loadu_si128(reinterpret_cast(row_ids + i)); - __m256i rows_64 = _mm256_cvtepu32_epi64(rows_32); - - // OR with prefix to create provenance values - __m256i result = _mm256_or_si256(rows_64, prefix_vec); - - // Store to aligned buffer, then write individually (page-safe) - alignas(32) uint64_t out[4]; - _mm256_store_si256(reinterpret_cast<__m256i *>(out), result); - - dest.write_at(start_idx + i, out[0]); - dest.write_at(start_idx + i + 1, out[1]); - dest.write_at(start_idx + i + 2, out[2]); - dest.write_at(start_idx + i + 3, out[3]); - } -#elif defined(__aarch64__) - // NEON: Process 2 x uint64_t at a time - const uint64x2_t prefix_vec = vdupq_n_u64(prefix); - - for (; i + 2 <= count; i += 2) { - // Load 2 x uint32_t and zero-extend to 2 x uint64_t - uint32x2_t rows_32 = vld1_u32(row_ids + i); - uint64x2_t rows_64 = 
vmovl_u32(rows_32); - - // OR with prefix - uint64x2_t result = vorrq_u64(rows_64, prefix_vec); - - // Store individually (page boundary safe) - dest.write_at(start_idx + i, vgetq_lane_u64(result, 0)); - dest.write_at(start_idx + i + 1, vgetq_lane_u64(result, 1)); - } -#endif - - // Scalar remainder - for (; i < count; ++i) { - dest.write_at(start_idx + i, - prefix | static_cast(row_ids[i])); +inline size_t write_row_ids_direct(mema::DeferredTable &dest, size_t start_idx, + const uint32_t *row_ids, size_t count) { + for (size_t i = 0; i < count; ++i) { + dest.write_at(start_idx + i, row_ids[i]); } - return count; } /** - * @brief Copy provenance from source column using batch reads. - * - * Copies existing 64-bit provenance values from child intermediate. - * Uses contiguous batch access for better cache behavior. + * @brief Copy row IDs from child deferred table. * - * @param dest Destination deferred column - * @param start_idx Starting output index - * @param src Source deferred column (from child) - * @param row_ids Row indices into source column - * @param count Number of rows to copy - * @return Number of rows processed (always == count) + * For intermediate inputs, we look up the base table row ID from the + * child's deferred table and copy it to the parent's deferred table. 
*/ -inline size_t copy_provenance_batch(mema::deferred_column_t &dest, - size_t start_idx, - const mema::deferred_column_t &src, - const uint32_t *row_ids, size_t count) { +inline size_t copy_row_ids_from_child(mema::DeferredTable &dest, + size_t start_idx, + const mema::DeferredTable &src, + const uint32_t *row_ids, size_t count) { for (size_t i = 0; i < count; ++i) { dest.write_at(start_idx + i, src[row_ids[i]]); } return count; } -} // namespace simd_provenance +} // namespace row_id_ops // ============================================================================ // Source Precomputation Structures // ============================================================================ /** - * @brief Precomputed metadata for deferred column sources. + * @brief Precomputed metadata for a deferred table source. * - * Tracks where each deferred column's provenance comes from: - * - For columnar inputs: encode fresh (table_id, column_idx, row_id) - * - For IntermediateResult inputs: copy existing provenance from child + * Groups columns by (from_build, base_table_id) so we only store 32-bit + * row IDs once per unique base table instead of 64-bit provenance per column. */ -struct DeferredColumnSource { - const mema::deferred_column_t *source_col = - nullptr; ///< Source if from intermediate. - uint8_t base_table_id = 0; ///< Base table ID for encoding. - uint8_t base_column_idx = 0; ///< Base column index for encoding. - bool from_build = false; ///< True if from build side. - bool needs_encode = false; ///< True if columnar (needs fresh encode). +struct DeferredTableSource { + const mema::DeferredTable *child_table = + nullptr; ///< Source deferred table from child (if any). + uint8_t base_table_id = 0; ///< Base table ID. + uint8_t dest_table_idx = 0; ///< Index in result.deferred_tables[]. + bool from_build = false; ///< True if from build side. + bool needs_direct = false; ///< True if columnar (write row IDs directly). 
}; /** @@ -183,8 +103,8 @@ struct alignas(8) MaterializedColumnSource { const mema::column_t *intermediate_col = nullptr; ///< Source if from IntermediateResult materialized const Column *columnar_col = nullptr; ///< Source if from ColumnarTable - const mema::deferred_column_t *deferred_resolve_col = - nullptr; ///< Source if needs deferred resolution + const mema::DeferredTable *deferred_table = + nullptr; ///< Source deferred table if needs resolution size_t child_output_idx = 0; ///< Index in child's output size_t mat_col_idx = 0; ///< Index in result.materialized[] DataType type = DataType::INT32; @@ -294,57 +214,93 @@ create_empty_intermediate_result(const AnalyzedJoinNode &node) { result.deferred_map.resize(node.columns.size(), std::nullopt); size_t mat_count = 0; - size_t def_count = 0; for (const auto &col : node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { result.materialized_map[col.original_idx] = mat_count++; - } else { - result.deferred_map[col.original_idx] = def_count++; } + // For empty result, we don't need to set up deferred tables } result.materialized.resize(mat_count); - result.deferred_columns.resize(def_count); return result; } /** - * @brief Prepare deferred column sources for intermediate construction. + * @brief Prepare deferred table sources for intermediate construction. + * + * Groups deferred columns by (from_build, base_table_id) to create + * DeferredTable entries. Returns list of sources for populating the tables. 
*/ -inline std::vector -prepare_deferred_sources(const AnalyzedJoinNode &join_node, - const JoinInput &build_input, - const JoinInput &probe_input, bool build_is_left) { - std::vector sources; - sources.reserve(join_node.num_deferred_columns); +inline std::vector +prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, + const JoinInput &build_input, + const JoinInput &probe_input, bool build_is_left, + IntermediateResult &out_result) { + // Map from (from_build << 8 | base_table_id) -> dest_table_idx + std::unordered_map table_key_to_idx; + std::vector sources; for (const auto &col : join_node.columns) { if (col.resolution != ColumnResolution::DEFER) continue; - DeferredColumnSource src; - src.base_table_id = col.provenance.base_table_id; - src.base_column_idx = col.provenance.base_column_idx; - src.from_build = (col.from_left == build_is_left); - - const auto &src_input = src.from_build ? build_input : probe_input; - - if (src_input.is_columnar()) { - src.needs_encode = true; - src.source_col = nullptr; - } else { - const auto *child_def = - src_input.get_deferred_column(col.child_output_idx); - if (child_def) { - src.needs_encode = false; - src.source_col = child_def; + bool from_build = (col.from_left == build_is_left); + uint16_t key = (static_cast(from_build) << 8) | + col.provenance.base_table_id; + + auto it = table_key_to_idx.find(key); + uint8_t dest_idx; + + if (it == table_key_to_idx.end()) { + // New deferred table needed + dest_idx = static_cast(out_result.deferred_tables.size()); + table_key_to_idx[key] = dest_idx; + + mema::DeferredTable dt; + dt.base_table_id = col.provenance.base_table_id; + dt.from_build = from_build; + out_result.deferred_tables.push_back(std::move(dt)); + + // Create source entry + DeferredTableSource src; + src.base_table_id = col.provenance.base_table_id; + src.dest_table_idx = dest_idx; + src.from_build = from_build; + + const auto &src_input = from_build ? 
build_input : probe_input; + if (src_input.is_columnar()) { + src.needs_direct = true; + src.child_table = nullptr; } else { - src.needs_encode = true; - src.source_col = nullptr; + // Find child's deferred table for this base table + const auto *child_ref = + src_input.get_deferred_ref(col.child_output_idx); + if (child_ref) { + src.needs_direct = false; + src.child_table = + src_input.get_deferred_table(col.child_output_idx); + } else { + // Child materialized this, shouldn't happen for DEFER cols + src.needs_direct = true; + src.child_table = nullptr; + } } + sources.push_back(src); + } else { + dest_idx = it->second; } - sources.push_back(src); + + // Add column to deferred table's column list + out_result.deferred_tables[dest_idx].column_indices.push_back( + col.provenance.base_column_idx); + + // Set up deferred_map entry + DeferredColumnRef ref; + ref.table_idx = dest_idx; + ref.base_col = col.provenance.base_column_idx; + out_result.deferred_map[col.original_idx] = ref; } + return sources; } @@ -391,8 +347,9 @@ prepare_materialized_sources(const AnalyzedJoinNode &join_node, ir.get_materialized(col.child_output_idx); } else if (ir.is_deferred(col.child_output_idx)) { src.needs_deferred_resolve = true; - src.deferred_resolve_col = - ir.get_deferred(col.child_output_idx); + src.deferred_table = + ir.get_deferred_table(col.child_output_idx); + // base_column_idx is already set from col.provenance } } sources.push_back(src); @@ -408,10 +365,9 @@ prepare_materialized_sources(const AnalyzedJoinNode &join_node, /** * @brief Constructs intermediate result from thread-local buffers. * - * Optimized with column-major iteration and SIMD provenance encoding. + * Optimized with column-major iteration and per-table 32-bit row ID storage. * Only materializes columns marked MATERIALIZE in the AnalyzedJoinNode. - * Deferred columns store 64-bit provenance encoding for resolution at final - * output. + * Deferred columns share row ID storage per unique base table. 
* * @tparam Mode Collection mode for compile-time specialization. * @param buffers Thread-local match buffers from probe. @@ -454,30 +410,31 @@ void construct_intermediate_from_buffers( out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); + // Count materialized columns and set up maps size_t mat_count = 0; - size_t def_count = 0; for (const auto &col : join_node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { out_result.materialized_map[col.original_idx] = mat_count++; - } else { - out_result.deferred_map[col.original_idx] = def_count++; } } - // Precompute sources for column-major iteration + // Prepare deferred table sources (this populates deferred_tables and + // deferred_map) + auto deferred_sources = prepare_deferred_table_sources( + join_node, build_input, probe_input, build_is_left, out_result); + + // Precompute materialized sources auto mat_sources = prepare_materialized_sources(join_node, build_input, probe_input, build_is_left); - auto deferred_sources = prepare_deferred_sources( - join_node, build_input, probe_input, build_is_left); // Pre-allocate pages using Page = mema::column_t::Page; - using DeferredPage = mema::deferred_column_t::Page; + using DeferredPage = mema::DeferredTable::Page; size_t mat_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; size_t def_pages_needed = - (total_matches + mema::deferred_column_t::ENTRIES_PER_PAGE - 1) / - mema::deferred_column_t::ENTRIES_PER_PAGE; + (total_matches + mema::DeferredTable::ENTRIES_PER_PAGE - 1) / + mema::DeferredTable::ENTRIES_PER_PAGE; out_result.materialized.resize(mat_count); for (size_t c = 0; c < mat_count; ++c) { @@ -485,10 +442,9 @@ void construct_intermediate_from_buffers( out_result.materialized[c].set_row_count(total_matches); } - out_result.deferred_columns.resize(def_count); - for (size_t d = 0; d < def_count; ++d) { - 
out_result.deferred_columns[d].pages.resize(def_pages_needed); - out_result.deferred_columns[d].set_row_count(total_matches); + for (auto &dt : out_result.deferred_tables) { + dt.pages.resize(def_pages_needed); + dt.set_row_count(total_matches); } // Set source metadata for materialized columns @@ -500,6 +456,7 @@ void construct_intermediate_from_buffers( } const size_t num_threads = THREAD_COUNT; + const size_t num_deferred_tables = out_result.deferred_tables.size(); // Parallel page allocation worker_pool().execute([&](size_t t) { @@ -512,14 +469,14 @@ void construct_intermediate_from_buffers( col.pages[p] = reinterpret_cast(ptr); } } - for (size_t d = 0; d < def_count; ++d) { - auto &def_col = out_result.deferred_columns[d]; + for (size_t d = 0; d < num_deferred_tables; ++d) { + auto &dt = out_result.deferred_tables[d]; for (size_t p = t; p < def_pages_needed; p += num_threads) { + // Use IR_PAGE (16KB) for DeferredTable pages void *ptr = Contest::platform::get_arena(t) - .alloc_chunk< - Contest::platform::ChunkType::DEFERRED_PAGE>(); - def_col.pages[p] = reinterpret_cast(ptr); + .alloc_chunk(); + dt.pages[p] = reinterpret_cast(ptr); } } }); @@ -564,23 +521,21 @@ void construct_intermediate_from_buffers( for (uint32_t rid : range) { dest_col.write_at(k++, vec[rid]); } - } else if (src.needs_deferred_resolve && src.deferred_resolve_col) { - // Deferred in child - resolve via provenance - const auto &def_col = *src.deferred_resolve_col; + } else if (src.needs_deferred_resolve && src.deferred_table) { + // Deferred in child - resolve via deferred table + base table + const auto &def_table = *src.deferred_table; size_t k = start; for (uint32_t rid : range) { - uint64_t prov = def_col[rid]; - uint8_t base_tid = DeferredProvenance::table(prov); - uint8_t base_col = DeferredProvenance::column(prov); - uint64_t base_row = DeferredProvenance::row(prov); + uint32_t base_row = def_table[rid]; if (analyzed_plan.original_plan) [[likely]] { const auto &base_table = - 
analyzed_plan.original_plan->inputs[base_tid]; + analyzed_plan.original_plan + ->inputs[src.base_table_id]; mema::value_t val = columnar_reader.read_value_direct_public( - base_table.columns[base_col], - static_cast(base_row), src.type); + base_table.columns[src.base_column_idx], + base_row, src.type); dest_col.write_at(k++, val); } else { dest_col.write_at( @@ -591,52 +546,33 @@ void construct_intermediate_from_buffers( } // ==================================================================== - // Process DEFERRED columns (column-major with SIMD batch encoding) + // Process DEFERRED tables (one pass per unique base table) // ==================================================================== - for (size_t d = 0; d < deferred_sources.size(); ++d) { - const auto &def_src = deferred_sources[d]; - auto &dest_def_col = out_result.deferred_columns[d]; - - if (def_src.needs_encode) { - // Fresh encoding from columnar input - use SIMD batch - auto batch_reader = def_src.from_build - ? buf.left_batch_reader() - : buf.right_batch_reader(); - - size_t k = start; - while (batch_reader.has_more()) { - size_t batch_count; - // Request larger batches for SIMD efficiency - constexpr size_t MAX_BATCH = - simd_provenance::BATCH_SIZE > 0 ? 64 : 256; - const uint32_t *row_ids = - batch_reader.get_batch(MAX_BATCH, batch_count); - - if (batch_count > 0) { - simd_provenance::encode_provenance_batch( - dest_def_col, k, row_ids, batch_count, - def_src.base_table_id, def_src.base_column_idx); - k += batch_count; - } - } - } else if (def_src.source_col) { - // Copy existing provenance from child intermediate - auto batch_reader = def_src.from_build - ? 
buf.left_batch_reader() - : buf.right_batch_reader(); - - size_t k = start; - while (batch_reader.has_more()) { - size_t batch_count; - const uint32_t *row_ids = - batch_reader.get_batch(256, batch_count); - - if (batch_count > 0) { - simd_provenance::copy_provenance_batch( - dest_def_col, k, *def_src.source_col, row_ids, + for (const auto &def_src : deferred_sources) { + auto &dest_table = + out_result.deferred_tables[def_src.dest_table_idx]; + + auto batch_reader = def_src.from_build ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + if (def_src.needs_direct) { + // Columnar input: write row IDs directly + row_id_ops::write_row_ids_direct(dest_table, k, row_ids, + batch_count); + } else if (def_src.child_table) { + // Intermediate input: copy from child's deferred table + row_id_ops::copy_row_ids_from_child( + dest_table, k, *def_src.child_table, row_ids, batch_count); - k += batch_count; } + k += batch_count; } } } diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index dca7f49..4425042 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -3,8 +3,7 @@ * @brief Final materialization for execution path. * * Materializes all output columns at the root join, resolving deferred - * columns by decoding 64-bit provenance (table_id, column_idx, row_id) back - * to base tables. + * columns by looking up 32-bit row IDs in DeferredTable back to base tables. * * @see construct_intermediate.h for building IntermediateResult intermediates. */ @@ -248,7 +247,7 @@ inline void materialize_column( /** * @brief Materialize single output column handling deferred resolution. 
* - * For deferred columns, resolves via 64-bit provenance encoding back to + * For deferred columns, resolves via DeferredTable (32-bit row ID) back to * base table. * * @tparam Mode Collection mode for compile-time specialization. @@ -282,7 +281,9 @@ inline void materialize_single_column( // Determine how to read the value const Column *columnar_source = nullptr; const mema::column_t *materialized_source = nullptr; - const mema::deferred_column_t *deferred_source = nullptr; + const mema::DeferredTable *deferred_table = nullptr; + uint8_t deferred_base_col = 0; + uint8_t deferred_base_table = 0; if (src_input.is_columnar()) { // Direct columnar read @@ -297,8 +298,14 @@ inline void materialize_single_column( materialized_source = ir.get_materialized(col_info->child_output_idx); } else if (ir.is_deferred(col_info->child_output_idx)) { - // Deferred - need to resolve via 64-bit provenance - deferred_source = ir.get_deferred(col_info->child_output_idx); + // Deferred - need to resolve via deferred table + base table + deferred_table = ir.get_deferred_table(col_info->child_output_idx); + deferred_base_col = + ir.get_deferred_base_col(col_info->child_output_idx); + // Get base table ID from the deferred table itself + if (deferred_table) { + deferred_base_table = deferred_table->base_table_id; + } } } @@ -311,17 +318,15 @@ inline void materialize_single_column( col_info->type, cursor, from_build); } else if (materialized_source) { return (*materialized_source)[local_row_id]; - } else if (deferred_source && analyzed_plan.original_plan) { - // Deferred resolution: decode 64-bit provenance - uint64_t prov = (*deferred_source)[local_row_id]; - uint8_t base_tid = DeferredProvenance::table(prov); - uint8_t base_col = DeferredProvenance::column(prov); - uint64_t base_row = DeferredProvenance::row(prov); + } else if (deferred_table && analyzed_plan.original_plan) { + // Deferred resolution: look up base table row ID from deferred + // table + uint32_t base_row = 
(*deferred_table)[local_row_id]; const auto &base_table = - analyzed_plan.original_plan->inputs[base_tid]; + analyzed_plan.original_plan->inputs[deferred_base_table]; return columnar_reader.read_value( - base_table.columns[base_col], base_col, - static_cast(base_row), col_info->type, cursor, true); + base_table.columns[deferred_base_col], deferred_base_col, + base_row, col_info->type, cursor, true); } return mema::value_t{mema::value_t::NULL_VALUE}; }; @@ -347,12 +352,11 @@ inline void materialize_single_column( str_src_ptr = &analyzed_plan.original_plan ->inputs[materialized_source->source_table] .columns[materialized_source->source_column]; - } else if (deferred_source && analyzed_plan.original_plan) { - // For deferred VARCHAR, get source from provenance of first row - // All rows in a deferred column share the same base table/column - str_src_ptr = &analyzed_plan.original_plan - ->inputs[col_info->provenance.base_table_id] - .columns[col_info->provenance.base_column_idx]; + } else if (deferred_table && analyzed_plan.original_plan) { + // For deferred VARCHAR, get source from provenance metadata + str_src_ptr = + &analyzed_plan.original_plan->inputs[deferred_base_table] + .columns[deferred_base_col]; } } @@ -376,8 +380,8 @@ inline void materialize_single_column( /** * @brief Materialize all output columns from intermediate result. * - * For root join. Resolves all deferred columns by decoding 64-bit provenance - * to base tables. + * For root join. Resolves all deferred columns by looking up 32-bit row IDs + * in DeferredTable back to base tables. * * @tparam Mode Collection mode for compile-time specialization. * @param buffers Thread-local match buffers from probe. 
From 61e0f2fea5fbf921b5628daed1f8294128c6aa39 Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 17:54:38 +0200 Subject: [PATCH 06/13] feat: draft propagation --- include/data_model/intermediate.h | 163 +++++- include/join_execution/hash_join.h | 70 +++ include/join_execution/hashtable.h | 81 +++ include/join_execution/join_setup.h | 59 +++ include/join_execution/nested_loop.h | 52 +- .../materialization/construct_intermediate.h | 468 +++++++++++++++++- include/materialization/materialize.h | 20 +- src/execute.cpp | 83 +++- 8 files changed, 943 insertions(+), 53 deletions(-) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 6a697fa..53f6418 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -30,15 +30,128 @@ /** * @namespace mema * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages) + - * DeferredTable (32-bit row IDs). + * DeferredTable (32-bit row IDs) + key_row_column_t (8B tuples). * * value_t: INT32 direct or VARCHAR page/offset ref. column_t: arena-allocated * pages with write_at(). DeferredTable: 32-bit row ID storage per base table. + * key_row_column_t: (key, row_id) tuples for join key propagation. * * @see Contest::IntermediateResult, plan.h ColumnarTable. */ namespace mema { +/** + * @brief Join key with associated row ID for tuple-based storage. + * + * For LEFT_ONLY/RIGHT_ONLY modes: row_id is base table row ID (zero + * indirection) For BOTH mode: row_id may be IR index (requires deferred table + * lookup) + * + * 8-byte aligned for efficient memory access and potential SIMD operations. + */ +struct alignas(8) KeyRowPair { + int32_t key; ///< Join key value + uint32_t row_id; ///< Row ID (base table or IR index depending on mode) +}; + +/** + * @brief Column of (key, row_id) tuples for join key storage. 
+ * + * Enables accelerated hashtable build (tuples match internal format) and + * zero-indirection row ID propagation through join chains. Used instead of + * separate column_t for join key columns. + * + * Memory layout: 16KB pages containing 2048 KeyRowPair entries each. + */ +struct key_row_column_t { + static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB + static constexpr size_t PAIRS_PER_PAGE = + PAGE_SIZE / sizeof(KeyRowPair); // 2048 + static constexpr size_t ENTRY_SHIFT = 11; // log2(2048) + static constexpr size_t ENTRY_MASK = PAIRS_PER_PAGE - 1; + + struct alignas(PAGE_SIZE) Page { + KeyRowPair data[PAIRS_PER_PAGE]; + }; + + std::vector pages; + size_t num_values = 0; + + /// Base table ID for row_id component (valid when stores_base_row_ids=true) + uint8_t base_table_id = 0; + + /// Source column in base table (for VARCHAR provenance) + uint8_t source_column = 0; + + /// True if row_id contains base table row IDs, false if IR indices + bool stores_base_row_ids = false; + + key_row_column_t() = default; + + key_row_column_t(key_row_column_t &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values), + base_table_id(other.base_table_id), + source_column(other.source_column), + stores_base_row_ids(other.stores_base_row_ids) { + other.pages.clear(); + other.num_values = 0; + } + + key_row_column_t &operator=(key_row_column_t &&other) noexcept { + if (this != &other) { + pages = std::move(other.pages); + num_values = other.num_values; + base_table_id = other.base_table_id; + source_column = other.source_column; + stores_base_row_ids = other.stores_base_row_ids; + other.pages.clear(); + other.num_values = 0; + } + return *this; + } + + key_row_column_t(const key_row_column_t &) = delete; + key_row_column_t &operator=(const key_row_column_t &) = delete; + + ~key_row_column_t() = default; + + /// O(1) read: idx >> 11 for page, idx & 0x7FF for offset + inline KeyRowPair operator[](size_t idx) const { + return pages[idx >> 
ENTRY_SHIFT]->data[idx & ENTRY_MASK]; + } + + /// Thread-safe write at idx (requires pages set up first) + inline void write_at(size_t idx, KeyRowPair pair) { + pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = pair; + } + + /// Read only the key at index + inline int32_t key_at(size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK].key; + } + + /// Read only the row_id at index + inline uint32_t row_id_at(size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK].row_id; + } + + size_t row_count() const { return num_values; } + void set_row_count(size_t count) { num_values = count; } + + /// Pre-allocate pages from arena + inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, + size_t count) { + size_t pages_needed = (count + PAIRS_PER_PAGE - 1) / PAIRS_PER_PAGE; + pages.reserve(pages_needed); + for (size_t i = 0; i < pages_needed; ++i) { + void *ptr = + arena.alloc_chunk(); + pages.push_back(reinterpret_cast(ptr)); + } + num_values = count; + } +}; + /** * @brief 4-byte value: INT32 direct, VARCHAR packed (19-bit page + 13-bit * offset). @@ -274,29 +387,36 @@ struct DeferredColumnRef { /** * @brief Lightweight intermediate result with selective materialization. * - * Stores only columns marked MATERIALIZE (typically just the parent's join - * key). Deferred columns use per-table 32-bit row ID storage instead of - * per-column 64-bit provenance, achieving up to 10x memory reduction for - * multi-column deferred scenarios. + * Stores join key as (value, row_id) tuples for accelerated hashtable build + * and zero-indirection row ID propagation. Other columns use per-table 32-bit + * row ID storage for deferred resolution. 
* - * Memory savings example: For 5 columns from same base table: - * - Old: 5 columns Ɨ 8 bytes = 40 bytes per row - * - New: 1 DeferredTable Ɨ 4 bytes = 4 bytes per row + * For LEFT_ONLY/RIGHT_ONLY modes: join_key_tuples stores base table row IDs + * For BOTH mode: join_key_tuples may store IR indices + DeferredTable for other + * side * * @see AnalyzedColumnInfo for materialization decisions. + * @see key_row_column_t for tuple storage. * @see DeferredTable for 32-bit row ID storage. */ struct IntermediateResult { - /// Only columns marked MATERIALIZE (typically 1 join key). + /// Join key stored as (value, row_id) tuples for accelerated propagation. + /// Replaces materialized column for join key when present. + std::optional join_key_tuples; + + /// Index of join key column in output (nullopt if root or no tuples). + std::optional join_key_idx; + + /// Other materialized columns (non-join-key columns marked MATERIALIZE). std::vector materialized; /// Map: original column index -> index in materialized (nullopt if - /// deferred). + /// deferred or is join key). std::vector> materialized_map; /// Per-base-table deferred row ID storage. One DeferredTable per unique /// (from_build, base_table_id) pair. All columns from same base table share - /// the same row ID lookup. + /// the same row ID lookup. Used for BOTH mode's non-tracked side. std::vector deferred_tables; /// Map: original column index -> DeferredColumnRef (nullopt if @@ -319,19 +439,38 @@ struct IntermediateResult { /** @brief Total row count. */ size_t row_count() const { return num_rows; } + /** @brief Check if join key is stored as tuples. */ + bool has_join_key_tuples() const { return join_key_tuples.has_value(); } + + /** @brief Check if join key tuples contain base row IDs (vs IR indices). */ + bool join_key_has_base_rows() const { + return join_key_tuples && join_key_tuples->stores_base_row_ids; + } + + /** @brief Get join key tuple at index. 
*/ + mema::KeyRowPair get_join_key_tuple(size_t idx) const { + return join_key_tuples ? (*join_key_tuples)[idx] + : mema::KeyRowPair{0, 0}; + } + /** @brief Check if column was materialized (not deferred). */ bool is_materialized(size_t orig_idx) const { return orig_idx < materialized_map.size() && materialized_map[orig_idx].has_value(); } + /** @brief Check if column is the join key (stored as tuples). */ + bool is_join_key(size_t orig_idx) const { + return join_key_idx.has_value() && *join_key_idx == orig_idx; + } + /** @brief Check if column is deferred. */ bool is_deferred(size_t orig_idx) const { return orig_idx < deferred_map.size() && deferred_map[orig_idx].has_value(); } - /** @brief Get materialized column, or nullptr if deferred. */ + /** @brief Get materialized column, or nullptr if deferred/join key. */ const mema::column_t *get_materialized(size_t orig_idx) const { if (!is_materialized(orig_idx)) return nullptr; diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index 0e75ccf..5895f11 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -236,4 +236,74 @@ probe_columnar(const UnchainedHashtable &hash_table, return local_buffers; } +/** + * @brief Probe hash table with tuple column, returning thread-local buffers. + * + * Uses (key, row_id) tuples from IntermediateResult. The row_id in each + * tuple is propagated to the match buffer, enabling zero-indirection + * resolution when tuples contain base table row IDs. + * + * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY) for compile-time + * specialization of match buffer operations. + * @param hash_table Hash table to probe against. + * @param probe_tuples Tuple column containing (key, row_id) pairs. + * @return Vector of thread-local match buffers. 
+ */ +template +inline std::vector> +probe_tuples(const UnchainedHashtable &hash_table, + const mema::key_row_column_t &probe_tuples) { + + const auto *keys = hash_table.keys(); + const auto *row_ids = hash_table.row_ids(); + const size_t probe_count = probe_tuples.row_count(); + const size_t num_pages = probe_tuples.pages.size(); + + std::vector> local_buffers(THREAD_COUNT); + std::atomic page_counter(0); + + worker_pool().execute([&](size_t thread_id) { + local_buffers[thread_id] = ThreadLocalMatchBuffer( + Contest::platform::get_arena(thread_id)); + auto &local_buf = local_buffers[thread_id]; + + while (true) { + size_t page_idx = page_counter.fetch_add(1); + if (page_idx >= num_pages) + break; + + size_t base = page_idx * mema::key_row_column_t::PAIRS_PER_PAGE; + size_t end = std::min(base + mema::key_row_column_t::PAIRS_PER_PAGE, + probe_count); + + constexpr size_t PREFETCH_DIST = 8; + for (size_t idx = base; idx < end; ++idx) { + // Prefetch future slot + if (idx + PREFETCH_DIST < end) { + hash_table.prefetch_slot( + probe_tuples.key_at(idx + PREFETCH_DIST)); + } + + mema::KeyRowPair pair = probe_tuples[idx]; + + // Skip NULL keys + if (pair.key != mema::value_t::NULL_VALUE) { + auto [start_idx, end_idx] = + hash_table.find_indices(pair.key); + + for (uint64_t i = start_idx; i < end_idx; ++i) { + if (keys[i] == pair.key) { + // row_ids[i] = build side's row ID (base or IR) + // pair.row_id = probe side's row ID (base or IR) + local_buf.add_match(row_ids[i], pair.row_id); + } + } + } + } + } + }); + + return local_buffers; +} + } // namespace Contest::join diff --git a/include/join_execution/hashtable.h b/include/join_execution/hashtable.h index f98ea18..1ea10da 100644 --- a/include/join_execution/hashtable.h +++ b/include/join_execution/hashtable.h @@ -390,6 +390,87 @@ class UnchainedHashtable { }); } + /** + * @brief Build hash table from (key, row_id) tuple column. 
+ * + * Consumes tuples directly - row_ids are already in correct format + * (base table IDs or IR indices depending on how IR was constructed). + * More efficient than build_intermediate() as tuples match internal format. + * + * @param tuples Key-row tuple column from IntermediateResult. + * @param num_threads Thread count hint. + */ + void build_from_tuples(const mema::key_row_column_t &tuples, + int num_threads = 4) { + const size_t row_count = tuples.row_count(); + if (row_count == 0) + return; + + static constexpr size_t PARALLEL_BUILD_THRESHOLD = 10000; + num_threads = Contest::platform::worker_pool().thread_count(); + if (row_count < PARALLEL_BUILD_THRESHOLD) + num_threads = 1; + + const size_t num_slots = directory.size(); + const size_t num_partitions = + compute_num_partitions(row_count, num_threads); + const int partition_bits = __builtin_ctzll(num_partitions); + const size_t slots_per_partition = num_slots / num_partitions; + + std::vector allocators(num_threads); + for (int t = 0; t < num_threads; ++t) + allocators[t].set_arena(Contest::platform::get_arena(t)); + std::vector> thread_parts(num_threads); + for (auto &tp : thread_parts) + tp.resize(num_partitions); + + // Partition phase - 8-byte tuple reads, cache-friendly streaming + size_t batch = (row_count + num_threads - 1) / num_threads; + Contest::platform::worker_pool().execute([&, partition_bits](size_t t) { + size_t start = t * batch; + size_t end = std::min(start + batch, row_count); + if (start >= end) + return; + const int shift = 64 - partition_bits; + for (size_t i = start; i < end; ++i) { + mema::KeyRowPair pair = tuples[i]; + uint64_t h = hash_key(pair.key); + size_t p = (partition_bits == 0) ? 
0 : (h >> shift); + // Direct use of tuple - no conversion needed + thread_parts[t][p].append(allocators[t], + {pair.key, pair.row_id}); + } + }); + + // Compute global offsets from per-thread counts + Contest::platform::ArenaVector global_offsets(*arena_); + global_offsets.resize(num_partitions + 1); + std::memset(global_offsets.data(), 0, + (num_partitions + 1) * sizeof(size_t)); + for (size_t p = 0; p < num_partitions; ++p) { + for (size_t t = 0; t < num_threads; ++t) { + global_offsets[p + 1] += thread_parts[t][p].total_count; + } + global_offsets[p + 1] += global_offsets[p]; + } + + size_t total = global_offsets[num_partitions]; + if (total == 0) + return; + keys_.resize(total); + row_ids_.resize(total); + + // Build partitions in parallel + const int nt = num_threads; + Contest::platform::worker_pool().execute([&, nt](size_t t) { + for (size_t p = t; p < num_partitions; p += nt) { + build_partition( + thread_parts, p, slots_per_partition, global_offsets[p], + global_offsets[p + 1] - global_offsets[p], nt, t); + } + }); + } + /** * @brief Build hash table from ColumnarTable Column. * diff --git a/include/join_execution/join_setup.h b/include/join_execution/join_setup.h index 217995e..f2917f0 100644 --- a/include/join_execution/join_setup.h +++ b/include/join_execution/join_setup.h @@ -107,3 +107,62 @@ inline MatchCollectionMode determine_collection_mode( } } // namespace Contest::join + +namespace Contest { + +// Forward declare AnalyzedJoinNode +struct AnalyzedJoinNode; + +/** + * @brief Tracking info for one side of a join (build or probe). + * + * Determines whether to embed base table row IDs or IR indices in the + * output tuples for this side. + */ +struct SideTrackingInfo { + bool track_base_rows = + false; ///< True to embed base row IDs, false for IR indices + uint8_t base_table_id = 0; ///< Base table to track (if track_base_rows) +}; + +/** + * @brief Tracking configuration for intermediate construction. 
+ * + * Determines what row IDs to embed in join key tuples and whether + * DeferredTables are needed for non-tracked sides. + */ +struct TupleTrackingInfo { + SideTrackingInfo build_tracking; ///< Tracking info for build side + SideTrackingInfo probe_tracking; ///< Tracking info for probe side + bool key_from_build = + true; ///< True if parent join key comes from build side +}; + +/** + * @brief Result of a join execution before intermediate construction. + * + * Contains match buffers and metadata needed for deferred IR construction. + * Allows parent join to decide row ID format based on its cardinality + * requirements before constructing the intermediate result. + * + * @tparam Mode Match collection mode for this join's buffers. + */ +template struct MatchResult { + std::vector> buffers; + size_t total_count = 0; + + /// The inputs that were joined (for resolving row IDs during IR + /// construction) + JoinInput build_input; + JoinInput probe_input; + + /// Join configuration + const AnalyzedJoinNode *join_node = nullptr; + join::BuildProbeConfig config; + + /// Convenience accessors + size_t count() const { return total_count; } + bool empty() const { return total_count == 0; } +}; + +} // namespace Contest diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index c99a8be..d836409 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -34,8 +34,8 @@ using Contest::platform::worker_pool; * @brief Iterates over non-NULL values in a join input column. * * Abstracts columnar vs intermediate input. Handles NULL bitmaps. - * For IntermediateResult, reads from materialized columns (join keys are - * always materialized). + * For IntermediateResult, reads from join_key_tuples if available, + * otherwise from materialized columns (join keys are always available). * * @tparam Func void(uint32_t row_id, int32_t value). 
*/ @@ -70,10 +70,25 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } else { const auto &res = std::get(input.data); - // Join key must be materialized + + // Check if join key is stored as tuples + if (res.has_join_key_tuples() && res.join_key_idx.has_value() && + *res.join_key_idx == attr_idx) { + const auto &tuples = *res.join_key_tuples; + size_t count = tuples.row_count(); + for (size_t i = 0; i < count; i++) { + mema::KeyRowPair pair = tuples[i]; + if (pair.key != mema::value_t::NULL_VALUE) { + visitor(static_cast(i), pair.key); + } + } + return; + } + + // Fall back to materialized column const mema::column_t *col = res.get_materialized(attr_idx); if (!col) - return; // Should not happen - join keys are always materialized + return; // Should not happen - join keys are always available size_t count = col->row_count(); for (size_t i = 0; i < count; i++) { const mema::value_t &val = (*col)[i]; @@ -145,13 +160,21 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, page_offsets.push_back(current); } - // Setup for IntermediateResult probe + // Setup for IntermediateResult probe - check tuples first, then + // materialized const mema::column_t *probe_mat_col = nullptr; + const mema::key_row_column_t *probe_tuples = nullptr; if (!probe_input.is_columnar()) { const auto &res = std::get(probe_input.data); - probe_mat_col = res.get_materialized(probe_attr); - if (!probe_mat_col) - return {}; // Join key not materialized - should not happen + // Check if join key is stored as tuples + if (res.has_join_key_tuples() && res.join_key_idx.has_value() && + *res.join_key_idx == probe_attr) { + probe_tuples = &(*res.join_key_tuples); + } else { + probe_mat_col = res.get_materialized(probe_attr); + if (!probe_mat_col) + return {}; // Join key not available - should not happen + } } std::atomic probe_page_counter{0}; @@ -203,6 +226,19 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, } } } + } else 
if (probe_tuples) { + // IntermediateResult probe - use tuple column + const mema::key_row_column_t &tuples = *probe_tuples; + size_t count = tuples.row_count(); + size_t start = (t_id * count) / THREAD_COUNT; + size_t end = ((t_id + 1) * count) / THREAD_COUNT; + + for (size_t i = start; i < end; i++) { + mema::KeyRowPair pair = tuples[i]; + if (pair.key != mema::value_t::NULL_VALUE) { + process_value(static_cast(i), pair.key); + } + } } else { // IntermediateResult probe - use materialized column const mema::column_t &col = *probe_mat_col; diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index 99c7d69..a5431aa 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -104,7 +105,9 @@ struct alignas(8) MaterializedColumnSource { nullptr; ///< Source if from IntermediateResult materialized const Column *columnar_col = nullptr; ///< Source if from ColumnarTable const mema::DeferredTable *deferred_table = - nullptr; ///< Source deferred table if needs resolution + nullptr; ///< Source deferred table if needs resolution + const mema::key_row_column_t *tuple_col = + nullptr; ///< Source if from child's join_key_tuples size_t child_output_idx = 0; ///< Index in child's output size_t mat_col_idx = 0; ///< Index in result.materialized[] DataType type = DataType::INT32; @@ -113,6 +116,7 @@ struct alignas(8) MaterializedColumnSource { bool is_columnar = false; ///< True if source is ColumnarTable bool from_build = false; ///< True if from build side bool needs_deferred_resolve = false; ///< True if child deferred this column + bool needs_tuple_key_read = false; ///< True if reading key from tuples }; // ============================================================================ @@ -272,6 +276,8 @@ prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, 
src.needs_direct = true; src.child_table = nullptr; } else { + const auto &child_ir = + std::get(src_input.data); // Find child's deferred table for this base table const auto *child_ref = src_input.get_deferred_ref(col.child_output_idx); @@ -279,6 +285,17 @@ prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, src.needs_direct = false; src.child_table = src_input.get_deferred_table(col.child_output_idx); + } else if (child_ir.is_join_key(col.child_output_idx)) { + // Child stored this as tuples - the row_id in tuples + // is an IR index, but we need base table row IDs for + // deferred resolution. This shouldn't happen if the + // join key column is properly excluded from DEFER. + std::fprintf(stderr, + "[BUG] DEFER column %zu is child's " + "join key - this is unexpected!\n", + col.child_output_idx); + src.needs_direct = true; + src.child_table = nullptr; } else { // Child materialized this, shouldn't happen for DEFER cols src.needs_direct = true; @@ -342,7 +359,15 @@ prepare_materialized_sources(const AnalyzedJoinNode &join_node, src.is_columnar = false; const auto &ir = std::get(src_input.data); - if (ir.is_materialized(col.child_output_idx)) { + // Check source type in priority order: + // 1. Tuples (join key stored as key-row pairs) + // 2. Materialized column + // 3. 
Deferred table + if (ir.is_join_key(col.child_output_idx)) { + // Child stored this column as tuples - read key from there + src.needs_tuple_key_read = true; + src.tuple_col = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col.child_output_idx)) { src.intermediate_col = ir.get_materialized(col.child_output_idx); } else if (ir.is_deferred(col.child_output_idx)) { @@ -514,6 +539,14 @@ void construct_intermediate_from_buffers( col, src.child_output_idx, rid, src.type, cursor, src.from_build)); } + } else if (src.needs_tuple_key_read && src.tuple_col) { + // Child stored this column as tuples - read key from there + const auto &tuples = *src.tuple_col; + size_t k = start; + for (uint32_t rid : range) { + int32_t key = tuples.key_at(rid); + dest_col.write_at(k++, mema::value_t{key}); + } } else if (src.intermediate_col) { // Intermediate materialized source - direct copy const auto &vec = *src.intermediate_col; @@ -579,5 +612,436 @@ void construct_intermediate_from_buffers( }); } +// ============================================================================ +// Tuple-Based Intermediate Construction +// ============================================================================ + +/** + * @brief Resolves a row ID to base table row ID if possible. + * + * For columnar inputs: row ID is already base row ID (direct). + * For IR with tuples storing base rows: lookup via key_row_column_t. + * For IR with tuples storing IR indices: lookup via deferred table. + * For IR without tuples: lookup via deferred table. + * + * @param input The JoinInput to resolve from. + * @param row_id The row ID from match buffer. + * @param key_col_idx The join key column index in input's output. + * @return Resolved base table row ID. 
+ */
+inline uint32_t resolve_to_base_row(const JoinInput &input, uint32_t row_id,
+                                    size_t key_col_idx) {
+    if (input.is_columnar()) {
+        // Columnar input: row ID is already base table row
+        return row_id;
+    }
+
+    const auto &ir = std::get(input.data);
+
+    if (ir.has_join_key_tuples() && ir.join_key_has_base_rows()) {
+        // IR stores base row IDs in tuples - one lookup
+        return ir.join_key_tuples->row_id_at(row_id);
+    }
+
+    // IR stores IR indices - need deferred table lookup
+    const auto *def_table = ir.get_deferred_table(key_col_idx);
+    if (def_table) {
+        return (*def_table)[row_id];
+    }
+
+    // Fallback: return as-is (shouldn't happen for correct plans)
+    return row_id;
+}
+
+/**
+ * @brief Populates join key tuples column from match buffers.
+ *
+ * Extracts the join key value for each match from the key-providing side.
+ * The row ID written into every tuple is the OUTPUT IR index (the tuple's
+ * own write position), so the parent join can index back into this IR.
+ *
+ * @tparam Mode Match collection mode.
+ * @param buffers Thread-local match buffers.
+ * @param buffer_starts Per-buffer write offsets.
+ * @param build_input Build side input.
+ * @param probe_input Probe side input.
+ * @param key_from_build True if parent's join key comes from build side.
+ * @param key_child_output_idx Column index in the key input's output.
+ * @param out_tuples Output tuple column (pre-allocated).
+ * @param columnar_reader Reader for columnar access.
+ */
+template 
+void populate_join_key_tuples(
+    std::vector> &buffers,
+    const std::vector &buffer_starts, const JoinInput &build_input,
+    const JoinInput &probe_input, bool key_from_build,
+    size_t key_child_output_idx, mema::key_row_column_t &out_tuples,
+    ColumnarReader &columnar_reader) {
+
+    const JoinInput &key_input = key_from_build ? 
build_input : probe_input; + size_t key_attr = key_child_output_idx; + + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) + return; + + size_t write_pos = buffer_starts[t]; + + // Get the appropriate range based on which side provides the key + auto range = key_from_build ? buf.left_range() : buf.right_range(); + + if (key_input.is_columnar()) { + // Columnar source - read key from base table, but store OUTPUT IR + // index (write_pos) so parent can use it to index into this IR + auto *table = std::get(key_input.data); + auto [actual_col_idx, _] = key_input.node->output_attrs[key_attr]; + const Column &col = table->columns[actual_col_idx]; + + for (uint32_t row_id : range) { + // Use read_value_direct_public since page indices may not be + // prepared for the join key column (it's stored in tuples, + // not as a MATERIALIZE column) + int32_t key = + columnar_reader + .read_value_direct_public(col, row_id, DataType::INT32) + .value; + // Store OUTPUT IR index (write_pos), not base table row_id + // Parent needs IR index to access other columns in this IR + uint32_t output_ir_idx = static_cast(write_pos); + out_tuples.write_at(write_pos++, {key, output_ir_idx}); + } + } else { + // Intermediate source - store OUTPUT IR index + const auto &ir = std::get(key_input.data); + + // Only propagate existing tuples if they contain the column we need + // Otherwise, read from materialized column + if (ir.has_join_key_tuples() && ir.join_key_idx.has_value() && + *ir.join_key_idx == key_attr) { + // IR's tuples contain the column we need - propagate directly + const auto &src_tuples = *ir.join_key_tuples; + + for (uint32_t ir_idx : range) { + mema::KeyRowPair src = src_tuples[ir_idx]; + // Store OUTPUT IR index for parent to index into this IR + uint32_t output_ir_idx = static_cast(write_pos); + out_tuples.write_at(write_pos++, {src.key, output_ir_idx}); + } + } else { + // 
IR's tuples contain a different column, or no tuples exist + // Read from materialized column instead + const auto *mat_col = ir.get_materialized(key_attr); + if (mat_col) { + for (uint32_t ir_idx : range) { + int32_t key = (*mat_col)[ir_idx].value; + // Store OUTPUT IR index for parent to index into this + // IR + uint32_t output_ir_idx = + static_cast(write_pos); + out_tuples.write_at(write_pos++, {key, output_ir_idx}); + } + } + } + } + }); +} + +/** + * @brief Constructs intermediate result with tuple-based join key storage. + * + * Stores join key as (value, row_id) tuples for accelerated hashtable build + * and zero-indirection row ID propagation. Other columns handled normally + * via deferred tables or materialization. + * + * @tparam Mode Collection mode for compile-time specialization. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side data source. + * @param probe_input Probe side data source. + * @param join_node Analyzed join node with materialization decisions. + * @param config Build/probe configuration. + * @param build_is_left True if build side is the original left child. + * @param parent_key_idx Index of column that will be parent's join key. + * @param columnar_reader Reader for columnar data access. + * @param out_result Output IntermediateResult (populated in-place). + * @param analyzed_plan Full analyzed plan for base table access. 
+ */ +template +void construct_intermediate_with_tuples( + std::vector> &buffers, + const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const join::BuildProbeConfig &config, + bool build_is_left, size_t parent_key_idx, ColumnarReader &columnar_reader, + IntermediateResult &out_result, const AnalyzedPlan &analyzed_plan) { + + // Count total matches and compute buffer start offsets + size_t total_matches = 0; + std::vector buffer_starts(buffers.size()); + for (size_t i = 0; i < buffers.size(); ++i) { + buffer_starts[i] = total_matches; + total_matches += buffers[i].count(); + } + + if (total_matches == 0) { + out_result = create_empty_intermediate_result(join_node); + return; + } + + // Initialize result metadata + out_result.node_info = &join_node; + out_result.num_rows = total_matches; + out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); + out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); + + // Determine if parent's join key comes from build or probe side + // and which base table it traces back to + bool key_from_build = true; + size_t key_child_output_idx = 0; // Column index in child's output + uint8_t key_base_table_id = 0; + uint8_t key_base_column = 0; + + for (const auto &col : join_node.columns) { + if (col.original_idx == parent_key_idx) { + key_from_build = (col.from_left == build_is_left); + key_child_output_idx = col.child_output_idx; + key_base_table_id = col.provenance.base_table_id; + key_base_column = col.provenance.base_column_idx; + break; + } + } + + // Allocate join key tuples column + out_result.join_key_tuples.emplace(); + out_result.join_key_tuples->pre_allocate_from_arena( + Contest::platform::get_arena(0), total_matches); + out_result.join_key_tuples->base_table_id = key_base_table_id; + out_result.join_key_tuples->source_column = key_base_column; + // Always store OUTPUT IR indices (not base row IDs) so parent can + // index into this IR to access 
deferred columns
+    out_result.join_key_tuples->stores_base_row_ids = false;
+    out_result.join_key_idx = parent_key_idx;
+    const JoinInput &key_input = key_from_build ? build_input : probe_input;
+    (void)key_input; // Unused here; populate_join_key_tuples re-derives it
+
+    // Count non-join-key materialized columns and set up maps
+    size_t mat_count = 0;
+    for (const auto &col : join_node.columns) {
+        if (col.resolution == ColumnResolution::MATERIALIZE &&
+            col.original_idx != parent_key_idx) {
+            out_result.materialized_map[col.original_idx] = mat_count++;
+        }
+    }
+
+    // Prepare deferred table sources (unchanged from non-tuple version)
+    auto deferred_sources = prepare_deferred_table_sources(
+        join_node, build_input, probe_input, build_is_left, out_result);
+
+    // Precompute materialized sources (excluding join key)
+    std::vector mat_sources;
+    mat_sources.reserve(join_node.columns.size());
+    size_t mat_idx = 0;
+    for (const auto &col : join_node.columns) {
+        if (col.resolution != ColumnResolution::MATERIALIZE)
+            continue;
+        if (col.original_idx == parent_key_idx)
+            continue; // Skip join key - handled via tuples
+
+        MaterializedColumnSource src;
+        src.mat_col_idx = mat_idx++;
+        src.child_output_idx = col.child_output_idx;
+        src.type = col.type;
+        src.base_table_id = col.provenance.base_table_id;
+        src.base_column_idx = col.provenance.base_column_idx;
+        src.from_build = (col.from_left == build_is_left);
+
+        const auto &src_input = src.from_build ? build_input : probe_input;
+
+        if (src_input.is_columnar()) {
+            src.is_columnar = true;
+            const auto *table = std::get(src_input.data);
+            auto [actual_idx, _] =
+                src_input.node->output_attrs[col.child_output_idx];
+            src.columnar_col = &table->columns[actual_idx];
+        } else {
+            src.is_columnar = false;
+            const auto &ir = std::get(src_input.data);
+
+            // Check source type in priority order:
+            // 1. Tuples (join key stored as key-row pairs)
+            // 2. Materialized column
+            // 3. 
Deferred table + if (ir.is_join_key(col.child_output_idx)) { + // Child stored this column as tuples - read key from there + src.needs_tuple_key_read = true; + src.tuple_col = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col.child_output_idx)) { + src.intermediate_col = + ir.get_materialized(col.child_output_idx); + } else if (ir.is_deferred(col.child_output_idx)) { + src.needs_deferred_resolve = true; + src.deferred_table = + ir.get_deferred_table(col.child_output_idx); + } + } + mat_sources.push_back(src); + } + + // Pre-allocate pages + using Page = mema::column_t::Page; + using DeferredPage = mema::DeferredTable::Page; + size_t mat_pages_needed = + (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + size_t def_pages_needed = + (total_matches + mema::DeferredTable::ENTRIES_PER_PAGE - 1) / + mema::DeferredTable::ENTRIES_PER_PAGE; + + out_result.materialized.resize(mat_count); + for (size_t c = 0; c < mat_count; ++c) { + out_result.materialized[c].pages.resize(mat_pages_needed); + out_result.materialized[c].set_row_count(total_matches); + } + + for (auto &dt : out_result.deferred_tables) { + dt.pages.resize(def_pages_needed); + dt.set_row_count(total_matches); + } + + // Set source metadata for materialized columns + for (const auto &src : mat_sources) { + out_result.materialized[src.mat_col_idx].source_table = + src.base_table_id; + out_result.materialized[src.mat_col_idx].source_column = + src.base_column_idx; + } + + const size_t num_threads = THREAD_COUNT; + const size_t num_deferred_tables = out_result.deferred_tables.size(); + + // Parallel page allocation + worker_pool().execute([&](size_t t) { + for (size_t c = 0; c < mat_count; ++c) { + auto &col = out_result.materialized[c]; + for (size_t p = t; p < mat_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + col.pages[p] = reinterpret_cast(ptr); + } + } + for (size_t d = 0; d < num_deferred_tables; ++d) { + auto &dt = 
out_result.deferred_tables[d]; + for (size_t p = t; p < def_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + dt.pages[p] = reinterpret_cast(ptr); + } + } + }); + + // Populate join key tuples + populate_join_key_tuples( + buffers, buffer_starts, build_input, probe_input, key_from_build, + key_child_output_idx, *out_result.join_key_tuples, columnar_reader); + + // Populate other materialized columns and deferred tables + // (same logic as construct_intermediate_from_buffers) + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) + return; + + size_t start = buffer_starts[t]; + ColumnarReader::Cursor cursor; + + // Process MATERIALIZED columns (excluding join key) + for (const auto &src : mat_sources) { + auto &dest_col = out_result.materialized[src.mat_col_idx]; + + auto range = src.from_build ? buf.left_range() : buf.right_range(); + + if (src.is_columnar) { + const auto &col = *src.columnar_col; + size_t k = start; + for (uint32_t rid : range) { + mema::value_t val = columnar_reader.read_value( + col, src.child_output_idx, rid, src.type, cursor, + src.from_build); + dest_col.write_at(k++, val); + } + } else if (src.needs_tuple_key_read && src.tuple_col) { + // Child stored this column as tuples - read key from there + const auto &tuples = *src.tuple_col; + size_t k = start; + for (uint32_t rid : range) { + int32_t key = tuples.key_at(rid); + dest_col.write_at(k++, mema::value_t{key}); + } + } else if (src.intermediate_col) { + const auto &vec = *src.intermediate_col; + size_t k = start; + for (uint32_t rid : range) { + mema::value_t val = vec[rid]; + dest_col.write_at(k++, val); + } + } else if (src.needs_deferred_resolve && src.deferred_table) { + const auto &def_table = *src.deferred_table; + size_t k = start; + for (uint32_t rid : range) { + uint32_t base_row = def_table[rid]; + + if 
(analyzed_plan.original_plan) [[likely]] { + const auto &base_table = + analyzed_plan.original_plan + ->inputs[src.base_table_id]; + mema::value_t val = + columnar_reader.read_value_direct_public( + base_table.columns[src.base_column_idx], + base_row, src.type); + dest_col.write_at(k++, val); + } else { + dest_col.write_at( + k++, mema::value_t{mema::value_t::NULL_VALUE}); + } + } + } + } + + // Process DEFERRED tables + for (const auto &def_src : deferred_sources) { + auto &dest_table = + out_result.deferred_tables[def_src.dest_table_idx]; + + auto batch_reader = def_src.from_build ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + if (def_src.needs_direct) { + row_id_ops::write_row_ids_direct(dest_table, k, row_ids, + batch_count); + } else if (def_src.child_table) { + row_id_ops::copy_row_ids_from_child( + dest_table, k, *def_src.child_table, row_ids, + batch_count); + } + k += batch_count; + } + } + } + }); +} + } // namespace materialize } // namespace Contest diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index 4425042..4cff7ab 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -281,6 +281,7 @@ inline void materialize_single_column( // Determine how to read the value const Column *columnar_source = nullptr; const mema::column_t *materialized_source = nullptr; + const mema::key_row_column_t *tuple_source = nullptr; const mema::DeferredTable *deferred_table = nullptr; uint8_t deferred_base_col = 0; uint8_t deferred_base_table = 0; @@ -293,7 +294,10 @@ inline void materialize_single_column( columnar_source = &table->columns[actual_idx]; } else { const auto &ir = std::get(src_input.data); - if (ir.is_materialized(col_info->child_output_idx)) { + // Check if column is stored as join key 
tuples + if (ir.is_join_key(col_info->child_output_idx)) { + tuple_source = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col_info->child_output_idx)) { // Read from materialized column materialized_source = ir.get_materialized(col_info->child_output_idx); @@ -312,23 +316,29 @@ inline void materialize_single_column( // Create reader lambda auto reader = [&](uint32_t local_row_id, ColumnarReader::Cursor &cursor) -> mema::value_t { + mema::value_t result; if (columnar_source) { - return columnar_reader.read_value( + result = columnar_reader.read_value( *columnar_source, col_info->child_output_idx, local_row_id, col_info->type, cursor, from_build); + } else if (tuple_source) { + // Read key value from tuple column + result = mema::value_t{tuple_source->key_at(local_row_id)}; } else if (materialized_source) { - return (*materialized_source)[local_row_id]; + result = (*materialized_source)[local_row_id]; } else if (deferred_table && analyzed_plan.original_plan) { // Deferred resolution: look up base table row ID from deferred // table uint32_t base_row = (*deferred_table)[local_row_id]; const auto &base_table = analyzed_plan.original_plan->inputs[deferred_base_table]; - return columnar_reader.read_value( + result = columnar_reader.read_value( base_table.columns[deferred_base_col], deferred_base_col, base_row, col_info->type, cursor, true); + } else { + result = mema::value_t{mema::value_t::NULL_VALUE}; } - return mema::value_t{mema::value_t::NULL_VALUE}; + return result; }; // Materialize based on type diff --git a/src/execute.cpp b/src/execute.cpp index 29a485c..0a9e1fc 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -46,6 +46,7 @@ namespace Contest { using namespace join; using materialize::construct_intermediate_from_buffers; +using materialize::construct_intermediate_with_tuples; using materialize::create_empty_intermediate_result; using materialize::materialize_from_buffers; @@ -146,19 +147,30 @@ JoinResult execute_join_with_mode( } else { const 
auto &probe_result = std::get(probe_input.data); - // Probe using materialized column (should be the join key) - const auto *mat_col = - probe_result.get_materialized(config.probe_attr); - if (!mat_col) { - std::fprintf( - stderr, - "ERROR: probe join key not materialized! probe_attr=%zu " - "mat_map_size=%zu num_rows=%zu\n", - config.probe_attr, probe_result.materialized_map.size(), - probe_result.num_rows); - std::abort(); + + // Use tuple-based probe if available + if (probe_result.has_join_key_tuples() && + probe_result.join_key_idx.has_value() && + *probe_result.join_key_idx == config.probe_attr) { + match_buffers = probe_tuples( + *hash_table, *probe_result.join_key_tuples); + } else { + // Fall back to materialized column probe + const auto *mat_col = + probe_result.get_materialized(config.probe_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: probe join key not materialized! " + "probe_attr=%zu " + "mat_map_size=%zu num_rows=%zu has_tuples=%d\n", + config.probe_attr, probe_result.materialized_map.size(), + probe_result.num_rows, + probe_result.has_join_key_tuples() ? 
1 : 0); + std::abort(); + } + match_buffers = probe_intermediate(*hash_table, *mat_col); } - match_buffers = probe_intermediate(*hash_table, *mat_col); } auto probe_end = std::chrono::high_resolution_clock::now(); stats.hash_join_probe_ms += @@ -206,10 +218,18 @@ JoinResult execute_join_with_mode( config.remapped_attrs, build_input.output_size(), config.build_left); - construct_intermediate_from_buffers( - match_buffers, build_input, probe_input, join_node, - config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, result, plan); + // Use tuple-based construction if parent needs a join key + if (join_node.parent_join_key_idx.has_value()) { + construct_intermediate_with_tuples( + match_buffers, build_input, probe_input, join_node, config, + config.build_left, *join_node.parent_join_key_idx, + columnar_reader, result, plan); + } else { + construct_intermediate_from_buffers( + match_buffers, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left, columnar_reader, result, plan); + } } else { result = create_empty_intermediate_result(join_node); } @@ -280,17 +300,28 @@ JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, hash_table = build_from_columnar(build_input, config.build_attr); } else { const auto &ir = std::get(build_input.data); - const auto *mat_col = ir.get_materialized(config.build_attr); - if (!mat_col) { - std::fprintf( - stderr, - "ERROR: build join key not materialized! 
build_attr=%zu " - "mat_map_size=%zu num_rows=%zu\n", - config.build_attr, ir.materialized_map.size(), ir.num_rows); - std::abort(); + + // Use tuple-based build if available and matches build_attr + if (ir.has_join_key_tuples() && ir.join_key_idx.has_value() && + *ir.join_key_idx == config.build_attr) { + hash_table.emplace(ir.join_key_tuples->row_count()); + hash_table->build_from_tuples(*ir.join_key_tuples); + } else { + // Fall back to materialized column build + const auto *mat_col = ir.get_materialized(config.build_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: build join key not materialized! " + "build_attr=%zu " + "mat_map_size=%zu num_rows=%zu has_tuples=%d\n", + config.build_attr, ir.materialized_map.size(), + ir.num_rows, ir.has_join_key_tuples() ? 1 : 0); + std::abort(); + } + hash_table.emplace(mat_col->row_count()); + hash_table->build_intermediate(*mat_col); } - hash_table.emplace(mat_col->row_count()); - hash_table->build_intermediate(*mat_col); } auto build_end = std::chrono::high_resolution_clock::now(); stats.hashtable_build_ms += From ddae72eb6082f61d8675c09e8350be0a74628421 Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 18:19:23 +0200 Subject: [PATCH 07/13] fix: columnar reader initialization --- .../materialization/construct_intermediate.h | 37 +++++++++++++++---- src/execute.cpp | 4 +- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index a5431aa..f65218a 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -154,12 +154,15 @@ collect_input_columns(const JoinInput &input, * @brief Prepare ColumnarReader for intermediate construction. * * Sets up page indices for columns that need to be read from columnar inputs. + * If parent_key_idx is provided, also prepares the join key column for tuple + * population. 
*/ inline void prepare_intermediate_columns( ColumnarReader &reader, const JoinInput &build_input, const JoinInput &probe_input, const AnalyzedJoinNode &join_node, const std::vector> &remapped_attrs, - size_t build_size, bool build_is_left) { + size_t build_size, bool build_is_left, + std::optional parent_key_idx = std::nullopt) { bool build_is_columnar = build_input.is_columnar(); bool probe_is_columnar = probe_input.is_columnar(); @@ -195,6 +198,23 @@ inline void prepare_intermediate_columns( } } + // If parent needs a join key via tuples, mark that column as needed too + // This ensures page indices are prepared for efficient tuple population + if (parent_key_idx.has_value()) { + for (const auto &col : join_node.columns) { + if (col.original_idx == *parent_key_idx) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && + col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + break; + } + } + } + if (build_is_columnar) { reader.prepare_build( collect_input_columns(build_input, build_needed, arena)); @@ -695,19 +715,22 @@ void populate_join_key_tuples( auto range = key_from_build ? 
buf.left_range() : buf.right_range(); if (key_input.is_columnar()) { - // Columnar source - read key from base table, but store OUTPUT IR - // index (write_pos) so parent can use it to index into this IR + // Columnar source - read key from base table using prepared page + // index Store OUTPUT IR index (write_pos) so parent can use it to + // index into this IR auto *table = std::get(key_input.data); auto [actual_col_idx, _] = key_input.node->output_attrs[key_attr]; const Column &col = table->columns[actual_col_idx]; + // Use cursor for efficient sequential/near-sequential access + ColumnarReader::Cursor cursor; for (uint32_t row_id : range) { - // Use read_value_direct_public since page indices may not be - // prepared for the join key column (it's stored in tuples, - // not as a MATERIALIZE column) + // Use read_value with prepared page index (O(1) amortized) + // instead of read_value_direct_public (O(n) per read) int32_t key = columnar_reader - .read_value_direct_public(col, row_id, DataType::INT32) + .read_value(col, key_attr, row_id, DataType::INT32, + cursor, key_from_build) .value; // Store OUTPUT IR index (write_pos), not base table row_id // Parent needs IR index to access other columns in this IR diff --git a/src/execute.cpp b/src/execute.cpp index 0a9e1fc..1cb0e10 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -213,10 +213,12 @@ JoinResult execute_join_with_mode( IntermediateResult result; if (total_matches > 0) { // Prepare page indices for intermediate construction + // Pass parent_join_key_idx so the key column is prepared for tuple + // population materialize::prepare_intermediate_columns( columnar_reader, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), - config.build_left); + config.build_left, join_node.parent_join_key_idx); // Use tuple-based construction if parent needs a join key if (join_node.parent_join_key_idx.has_value()) { From e2abed6497d0f74689e2e181790ce81b749f3d20 Mon Sep 17 00:00:00 2001 
From: Themos Papatheofanous Date: Thu, 22 Jan 2026 18:54:21 +0200 Subject: [PATCH 08/13] feat: improvements --- include/data_access/columnar_reader.h | 162 ++++++++++++++++++ .../materialization/construct_intermediate.h | 81 ++++++++- 2 files changed, 238 insertions(+), 5 deletions(-) diff --git a/include/data_access/columnar_reader.h b/include/data_access/columnar_reader.h index e143c95..39348d0 100644 --- a/include/data_access/columnar_reader.h +++ b/include/data_access/columnar_reader.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -266,6 +267,127 @@ class ColumnarReader { global_probe_version.fetch_add(1, std::memory_order_relaxed); } + // ======================================================================== + // Base Table Page Index Methods (for O(1) deferred column resolution) + // ======================================================================== + + /** @brief Reset base table prepared flags for new query. */ + inline void reset_base_tables() { + base_table_prepared_.fill(false); + base_table_version_++; + } + + /** + * @brief Prepare page index for a base table column. + * + * Called once per unique (table_id, col_idx) before deferred resolution. + * Enables O(log P) page lookup instead of O(P) linear scan per read. + * + * @param table_id Base table ID (0-15). + * @param col_idx Column index within base table (0-15). + * @param column The Column to build page index for. + */ + inline void prepare_base_column(uint8_t table_id, uint8_t col_idx, + const Column &column) { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + if (idx >= MAX_BASE_TABLE_INDICES) + return; + + if (!base_table_prepared_[idx]) { + auto &arena = platform::get_arena(0); + base_table_indices_[idx] = PageIndex(arena); + base_table_indices_[idx].build(column); + base_table_prepared_[idx] = true; + } + } + + /** @brief Check if base column page index is prepared. 
*/ + inline bool is_base_column_prepared(uint8_t table_id, + uint8_t col_idx) const { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + return idx < MAX_BASE_TABLE_INDICES && base_table_prepared_[idx]; + } + + /** + * @brief Read value from base table using prepared page index. + * + * O(1) with cursor caching for sequential access, O(log P) on cache miss. + * Falls back to O(P) linear scan if page index not prepared. + * + * @param column The base table column. + * @param table_id Base table ID. + * @param col_idx Column index within base table. + * @param row_id Row ID within the column. + * @param data_type Data type of the column. + * @param cursor Thread-local cursor for caching. + * @return The value at the specified row. + */ + inline mema::value_t read_base_table_value(const Column &column, + uint8_t table_id, + uint8_t col_idx, uint32_t row_id, + DataType data_type, + Cursor &cursor) const { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + + if (idx >= MAX_BASE_TABLE_INDICES || !base_table_prepared_[idx]) { + // Fallback to O(P) linear scan + return read_value_direct(column, row_id, data_type); + } + + const PageIndex &page_index = base_table_indices_[idx]; + + // Dense INT32 fast path: O(1) arithmetic lookup + if (data_type == DataType::INT32 && page_index.is_dense_int32) { + return mema::value_t{read_dense_int32(page_index, row_id)}; + } + + // Check cursor cache (version uses base_table_version_ + idx for + // uniqueness) + uint64_t effective_version = base_table_version_ + idx; + bool cache_hit = + cursor.version == effective_version && cursor.cached_col == idx && + row_id >= cursor.cached_start && row_id < cursor.cached_end; + + if (!cache_hit) { + // Check sequential access optimization + if (cursor.version == effective_version && + cursor.cached_col == idx && row_id == cursor.cached_end) { + size_t next_page = cursor.cached_page + 1; + if (next_page < 
page_index.cumulative_rows.size()) { + load_page_into_cursor_base(column, page_index, next_page, + idx, effective_version, cursor); + } else { + // Past end of data + return mema::value_t{mema::value_t::NULL_VALUE}; + } + } else { + // Binary search for page + size_t page_num = page_index.find_page(row_id); + load_page_into_cursor_base(column, page_index, page_num, idx, + effective_version, cursor); + } + } + + // Now cursor is loaded for the correct page + uint32_t local_row = row_id - cursor.cached_start; + if (SPC_LIKELY(cursor.is_dense)) { + if (data_type == DataType::INT32) { + return mema::value_t{cursor.data_ptr[local_row]}; + } else { + return mema::value_t::encode_string( + cursor.page_idx_val, static_cast(local_row)); + } + } + if (SPC_UNLIKELY(cursor.is_special)) { + return mema::value_t::encode_string( + cursor.page_idx_val, mema::value_t::LONG_STRING_OFFSET); + } + return read_sparse(local_row, data_type, cursor); + } + /** @brief Fast path: check cursor cache, dispatch to appropriate handler. */ template @@ -560,8 +682,48 @@ class ColumnarReader { static_cast(data_idx)); } } + + /** @brief Load page into cursor for base table access. 
*/ + inline void load_page_into_cursor_base(const Column &column, + const PageIndex &page_index, + size_t page_num, size_t col_idx, + uint64_t version, + Cursor &cursor) const { + cursor.version = version; + cursor.cached_col = col_idx; + cursor.cached_page = page_num; + cursor.cached_start = page_index.page_start_row(page_num); + cursor.cached_end = page_index.cumulative_rows[page_num]; + cursor.page_idx_val = static_cast(page_num); + cursor.col_all_dense = page_index.all_dense; + + auto *page_data = column.pages[page_num]->data; + auto num_rows = *reinterpret_cast(page_data); + auto num_values = *reinterpret_cast(page_data + 2); + + cursor.is_special = (num_rows == 0xffff); + cursor.is_dense = (num_rows == num_values); + cursor.data_ptr = reinterpret_cast(page_data + 4); + + if (!cursor.is_dense && !cursor.is_special) { + size_t bitmap_size = (num_rows + 7) / 8; + cursor.bitmap_ptr = reinterpret_cast( + page_data + PAGE_SIZE - bitmap_size); + cursor.prefix_sum_ptr = + page_index.page_prefix_sums[page_num].data(); + } + } + std::vector build_page_indices; std::vector probe_page_indices; + + // Base table page indices for deferred column resolution. + // Index = (table_id << 4) | col_idx, supports 16 tables Ɨ 16 cols = 256. + static constexpr size_t BASE_TABLE_SHIFT = 4; + static constexpr size_t MAX_BASE_TABLE_INDICES = 256; + std::array base_table_indices_; + std::array base_table_prepared_{}; + uint64_t base_table_version_ = 0; }; } // namespace Contest::io diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index f65218a..ec3db86 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -48,10 +48,25 @@ namespace row_id_ops { * @brief Write row IDs directly from columnar input. * * For columnar inputs, we just write the row_id directly (it's already - * the base table row ID). + * the base table row ID). 
Optimized with memcpy when batch fits in one page. */ inline size_t write_row_ids_direct(mema::DeferredTable &dest, size_t start_idx, const uint32_t *row_ids, size_t count) { + // Constants for DeferredTable layout + constexpr size_t ENTRY_SHIFT = mema::DeferredTable::ENTRY_SHIFT; + constexpr size_t ENTRY_MASK = mema::DeferredTable::ENTRY_MASK; + + size_t page_idx = start_idx >> ENTRY_SHIFT; + size_t offset = start_idx & ENTRY_MASK; + + // Fast path: entire batch fits in current page + if (offset + count <= mema::DeferredTable::ENTRIES_PER_PAGE) { + std::memcpy(&dest.pages[page_idx]->data[offset], row_ids, + count * sizeof(uint32_t)); + return count; + } + + // Slow path: batch spans pages for (size_t i = 0; i < count; ++i) { dest.write_at(start_idx + i, row_ids[i]); } @@ -226,6 +241,50 @@ inline void prepare_intermediate_columns( } } +/** + * @brief Prepare page indices for base table columns used in deferred + * resolution. + * + * Called before constructing intermediate results to enable O(log P) page + * lookup instead of O(P) linear scan when resolving deferred columns that need + * to materialize values from base tables. + * + * @param reader ColumnarReader to prepare page indices in. + * @param mat_sources Precomputed materialized column sources. + * @param analyzed_plan Full analyzed plan containing base tables. + */ +inline void prepare_deferred_base_tables( + ColumnarReader &reader, + const std::vector &mat_sources, + const AnalyzedPlan &analyzed_plan) { + if (!analyzed_plan.original_plan) + return; + + // NOTE: We do NOT reset base tables here - they persist across joins + // within the same query since the base tables don't change. + // reset_base_tables() should only be called once per query, externally. 
+ + // Prepare page indices for each base table column that needs deferred + // resolve + for (const auto &src : mat_sources) { + if (src.needs_deferred_resolve) { + uint8_t table_id = src.base_table_id; + uint8_t col_idx = src.base_column_idx; + + if (!reader.is_base_column_prepared(table_id, col_idx)) { + if (table_id < analyzed_plan.original_plan->inputs.size()) { + const auto &base_table = + analyzed_plan.original_plan->inputs[table_id]; + if (col_idx < base_table.columns.size()) { + reader.prepare_base_column(table_id, col_idx, + base_table.columns[col_idx]); + } + } + } + } + } +} + /** * @brief Create empty intermediate result with proper schema. */ @@ -310,10 +369,12 @@ prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, // is an IR index, but we need base table row IDs for // deferred resolution. This shouldn't happen if the // join key column is properly excluded from DEFER. +#ifndef NDEBUG std::fprintf(stderr, "[BUG] DEFER column %zu is child's " "join key - this is unexpected!\n", col.child_output_idx); +#endif src.needs_direct = true; src.child_table = nullptr; } else { @@ -472,6 +533,9 @@ void construct_intermediate_from_buffers( auto mat_sources = prepare_materialized_sources(join_node, build_input, probe_input, build_is_left); + // Prepare page indices for base tables used in deferred resolution + prepare_deferred_base_tables(columnar_reader, mat_sources, analyzed_plan); + // Pre-allocate pages using Page = mema::column_t::Page; using DeferredPage = mema::DeferredTable::Page; @@ -539,6 +603,7 @@ void construct_intermediate_from_buffers( size_t start = buffer_starts[t]; ColumnarReader::Cursor cursor; + ColumnarReader::Cursor base_cursor; // For deferred resolution reads // ==================================================================== // Process MATERIALIZED columns (column-major for cache locality) @@ -586,9 +651,10 @@ void construct_intermediate_from_buffers( analyzed_plan.original_plan ->inputs[src.base_table_id]; mema::value_t 
val = - columnar_reader.read_value_direct_public( + columnar_reader.read_base_table_value( base_table.columns[src.base_column_idx], - base_row, src.type); + src.base_table_id, src.base_column_idx, + base_row, src.type, base_cursor); dest_col.write_at(k++, val); } else { dest_col.write_at( @@ -912,6 +978,9 @@ void construct_intermediate_with_tuples( mat_sources.push_back(src); } + // Prepare page indices for base tables used in deferred resolution + prepare_deferred_base_tables(columnar_reader, mat_sources, analyzed_plan); + // Pre-allocate pages using Page = mema::column_t::Page; using DeferredPage = mema::DeferredTable::Page; @@ -982,6 +1051,7 @@ void construct_intermediate_with_tuples( size_t start = buffer_starts[t]; ColumnarReader::Cursor cursor; + ColumnarReader::Cursor base_cursor; // For deferred resolution reads // Process MATERIALIZED columns (excluding join key) for (const auto &src : mat_sources) { @@ -1024,9 +1094,10 @@ void construct_intermediate_with_tuples( analyzed_plan.original_plan ->inputs[src.base_table_id]; mema::value_t val = - columnar_reader.read_value_direct_public( + columnar_reader.read_base_table_value( base_table.columns[src.base_column_idx], - base_row, src.type); + src.base_table_id, src.base_column_idx, + base_row, src.type, base_cursor); dest_col.write_at(k++, val); } else { dest_col.write_at( From d1f2c21ce4bdb669d286655fc868125bad91a59f Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 20:53:26 +0200 Subject: [PATCH 09/13] feat: tuple storage --- include/data_model/intermediate.h | 10 ++ include/join_execution/hash_join.h | 32 ++--- include/join_execution/hashtable.h | 192 +++++++++++++++++------------ 3 files changed, 142 insertions(+), 92 deletions(-) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 53f6418..0b723d0 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -138,6 +138,16 @@ struct key_row_column_t { size_t 
row_count() const { return num_values; } void set_row_count(size_t count) { num_values = count; } + /// Release ownership of pages for zero-copy transfer to hashtable. + /// After this call, the column is empty (pages cleared, num_values = 0). + /// @return Vector of page pointers (caller takes ownership). + std::vector release_pages() && { + std::vector released = std::move(pages); + pages.clear(); + num_values = 0; + return released; + } + /// Pre-allocate pages from arena inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, size_t count) { diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index 5895f11..f518df2 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -82,8 +82,7 @@ template inline std::vector> probe_intermediate(const UnchainedHashtable &hash_table, const mema::column_t &probe_column) { - const auto *keys = hash_table.keys(); - const auto *row_ids = hash_table.row_ids(); + const auto *entries = hash_table.entries(); size_t pool_size = THREAD_COUNT; std::vector> local_buffers(pool_size); @@ -124,8 +123,8 @@ probe_intermediate(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t i = start_idx; i < end_idx; ++i) { - if (keys[i] == key_val) { - local_buf.add_match(row_ids[i], + if (entries[i].key == key_val) { + local_buf.add_match(entries[i].row_id, static_cast(idx)); } } @@ -151,8 +150,7 @@ inline std::vector> probe_columnar(const UnchainedHashtable &hash_table, const JoinInput &probe_input, size_t probe_attr) { - const auto *keys = hash_table.keys(); - const auto *row_ids = hash_table.row_ids(); + const auto *entries = hash_table.entries(); auto *table = std::get(probe_input.data); auto [actual_idx_col, _] = probe_input.node->output_attrs[probe_attr]; @@ -200,8 +198,9 @@ probe_columnar(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t j = start_idx; j < end_idx; ++j) { - if (keys[j] == 
key_val) { - local_buf.add_match(row_ids[j], probe_row_id); + if (entries[j].key == key_val) { + local_buf.add_match(entries[j].row_id, + probe_row_id); } } probe_row_id++; @@ -222,8 +221,9 @@ probe_columnar(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t j = start_idx; j < end_idx; ++j) { - if (keys[j] == key_val) { - local_buf.add_match(row_ids[j], probe_row_id); + if (entries[j].key == key_val) { + local_buf.add_match(entries[j].row_id, + probe_row_id); } } } @@ -254,8 +254,7 @@ inline std::vector> probe_tuples(const UnchainedHashtable &hash_table, const mema::key_row_column_t &probe_tuples) { - const auto *keys = hash_table.keys(); - const auto *row_ids = hash_table.row_ids(); + const auto *entries = hash_table.entries(); const size_t probe_count = probe_tuples.row_count(); const size_t num_pages = probe_tuples.pages.size(); @@ -292,10 +291,11 @@ probe_tuples(const UnchainedHashtable &hash_table, hash_table.find_indices(pair.key); for (uint64_t i = start_idx; i < end_idx; ++i) { - if (keys[i] == pair.key) { - // row_ids[i] = build side's row ID (base or IR) - // pair.row_id = probe side's row ID (base or IR) - local_buf.add_match(row_ids[i], pair.row_id); + if (entries[i].key == pair.key) { + // entries[i].row_id = build side's row ID (base or + // IR) pair.row_id = probe side's row ID (base or + // IR) + local_buf.add_match(entries[i].row_id, pair.row_id); } } } diff --git a/include/join_execution/hashtable.h b/include/join_execution/hashtable.h index 1ea10da..a39c3aa 100644 --- a/include/join_execution/hashtable.h +++ b/include/join_execution/hashtable.h @@ -57,12 +57,18 @@ using Contest::join::BLOOM_TAGS; */ class UnchainedHashtable { public: - /** @brief Key-rowid pair for hash table entries. */ + /** @brief Key-rowid pair for hash table entries (build phase). */ struct alignas(4) Tuple { int32_t key; /**< Join key value. */ uint32_t row_id; /**< Row index in source table. 
*/ }; + /** @brief Fused key-rowid for cache-friendly probe (8-byte aligned). */ + struct alignas(8) Entry { + int32_t key; /**< Join key value. */ + uint32_t row_id; /**< Row index in source table. */ + }; + /** @brief L2-sized chunk for partition buffers. */ static constexpr size_t CHUNK_SIZE = 4096; static constexpr size_t CHUNK_HEADER = 16; @@ -126,28 +132,11 @@ class UnchainedHashtable { nullptr; /**< Arena for hash table allocations. */ Contest::platform::ArenaVector directory; /**< Slot entries: (end_offset << 16) | bloom_tag. */ - Contest::platform::ArenaVector - keys_; /**< Contiguous key storage, indexed by directory. */ - Contest::platform::ArenaVector - row_ids_; /**< Parallel row_id storage, same indexing. */ + Contest::platform::ArenaVector + entries_; /**< Fused key+row_id storage, indexed by directory. */ int shift = 0; /**< Bit shift for slot calculation: slot = hash >> (64-shift). */ - /** - * @brief CRC32-based hash with multiplicative mixing. - * @param key INT32 join key. - * @return 64-bit hash (upper bits index directory slot). - */ - static uint64_t hash_key(int32_t key) noexcept { - constexpr uint64_t k = 0x8648DBDB; -#if defined(__aarch64__) - uint32_t crc = __crc32w(0, static_cast(key)); -#else - uint32_t crc = _mm_crc32_u32(0, static_cast(key)); -#endif - return crc * ((k << 32) + 1); - } - /** * @brief Returns bloom tag from hash. Uses bits 32-42 to index BLOOM_TAGS. 
* @see bloom_tags.h @@ -231,8 +220,7 @@ class UnchainedHashtable { uint64_t h = hash_key(tup.key); size_t local_slot = slot_for(h) - slot_start; uint32_t idx = offsets[local_slot] + counts[local_slot]++; - keys_[idx] = tup.key; - row_ids_[idx] = tup.row_id; + entries_[idx] = {tup.key, tup.row_id}; directory[slot_start + local_slot] |= bloom_tag(h); } } @@ -253,7 +241,7 @@ class UnchainedHashtable { */ explicit UnchainedHashtable(size_t build_size) : arena_(&Contest::platform::get_arena(0)), directory(*arena_), - keys_(*arena_), row_ids_(*arena_) { + entries_(*arena_) { size_t pow2 = 2048; while (pow2 < build_size) pow2 <<= 1; @@ -262,17 +250,29 @@ class UnchainedHashtable { shift = __builtin_ctzll(pow2); } - /** @brief Number of keys in the hash table. */ - size_t size() const noexcept { return keys_.size(); } + /** @brief Number of entries in the hash table. */ + size_t size() const noexcept { return entries_.size(); } /** @brief True if hash table is empty. */ - bool empty() const noexcept { return keys_.empty(); } + bool empty() const noexcept { return entries_.empty(); } - /** @brief Direct access to key array for probe. */ - const int32_t *keys() const noexcept { return keys_.data(); } + /** @brief Direct access to fused entry array for probe. */ + const Entry *entries() const noexcept { return entries_.data(); } - /** @brief Direct access to row_id array for probe. */ - const uint32_t *row_ids() const noexcept { return row_ids_.data(); } + /** + * @brief CRC32-based hash with multiplicative mixing. Public for pre-hash. + * @param key INT32 join key. + * @return 64-bit hash (upper bits index directory slot). + */ + static uint64_t hash_key(int32_t key) noexcept { + constexpr uint64_t k = 0x8648DBDB; +#if defined(__aarch64__) + uint32_t crc = __crc32w(0, static_cast(key)); +#else + uint32_t crc = _mm_crc32_u32(0, static_cast(key)); +#endif + return crc * ((k << 32) + 1); + } /** * @brief Prefetch directory slot for a key to hide memory latency. 
@@ -286,13 +286,24 @@ class UnchainedHashtable { __builtin_prefetch(&directory[slot], 0, 2); } + /** + * @brief Prefetch directory slot using pre-computed hash. + * + * Avoids recomputing hash when already computed for another purpose. + * @param h Pre-computed hash from hash_key(). + */ + void prefetch_slot_prehashed(uint64_t h) const noexcept { + size_t slot = slot_for(h); + __builtin_prefetch(&directory[slot], 0, 2); + } + /** * @brief Find index range for keys matching probe key. * - * @return [start, end) into keys_/row_ids_; (0,0) if bloom rejects. + * @return [start, end) into entries_; (0,0) if bloom rejects. */ std::pair find_indices(int32_t key) const noexcept { - if (keys_.empty()) + if (entries_.empty()) return {0, 0}; uint64_t h = hash_key(key); @@ -308,6 +319,32 @@ class UnchainedHashtable { return {start, end}; } + /** + * @brief Find index range using pre-computed hash (avoids rehashing). + * + * Use when hash was already computed for prefetch or bloom filter check. + * @param key Original key (for comparison in caller). + * @param h Pre-computed hash from hash_key(key). + * @return [start, end) into entries_; (0,0) if bloom rejects. + */ + std::pair + find_indices_prehashed(int32_t key, uint64_t h) const noexcept { + (void)key; // Key used by caller for comparison, not needed here + if (entries_.empty()) + return {0, 0}; + + size_t slot = slot_for(h); + uint64_t entry = directory[slot]; + uint16_t tag = bloom_tag(h); + + if ((entry & tag) != tag) + return {0, 0}; + + uint64_t end = entry >> 16; + uint64_t start = (slot == 0) ? 0 : (directory[slot - 1] >> 16); + return {start, end}; + } + /** * @brief Build hash table from intermediate column_t. 
* @@ -376,8 +413,7 @@ class UnchainedHashtable { size_t total = global_offsets[num_partitions]; if (total == 0) return; - keys_.resize(total); - row_ids_.resize(total); + entries_.resize(total); // Build partitions in parallel const int nt = num_threads; @@ -393,54 +429,59 @@ class UnchainedHashtable { /** * @brief Build hash table from (key, row_id) tuple column. * - * Consumes tuples directly - row_ids are already in correct format - * (base table IDs or IR indices depending on how IR was constructed). - * More efficient than build_intermediate() as tuples match internal format. + * Radix-partitioned parallel build from key_row_column_t. + * Uses page-based work distribution for better cache locality. + * Each thread processes whole pages to avoid cross-page access. * * @param tuples Key-row tuple column from IntermediateResult. - * @param num_threads Thread count hint. + * @param num_threads Thread count hint (unused, uses pool size). */ void build_from_tuples(const mema::key_row_column_t &tuples, - int num_threads = 4) { + int /*num_threads*/ = 4) { const size_t row_count = tuples.row_count(); if (row_count == 0) return; - static constexpr size_t PARALLEL_BUILD_THRESHOLD = 10000; - num_threads = Contest::platform::worker_pool().thread_count(); - if (row_count < PARALLEL_BUILD_THRESHOLD) - num_threads = 1; - + const int pool_threads = Contest::platform::worker_pool().thread_count(); const size_t num_slots = directory.size(); const size_t num_partitions = - compute_num_partitions(row_count, num_threads); + compute_num_partitions(row_count, pool_threads); const int partition_bits = __builtin_ctzll(num_partitions); const size_t slots_per_partition = num_slots / num_partitions; - std::vector allocators(num_threads); - for (int t = 0; t < num_threads; ++t) + // Thread-local partitions for lock-free parallel partitioning + std::vector allocators(pool_threads); + for (int t = 0; t < pool_threads; ++t) allocators[t].set_arena(Contest::platform::get_arena(t)); - 
std::vector> thread_parts(num_threads); + std::vector> thread_parts(pool_threads); for (auto &tp : thread_parts) tp.resize(num_partitions); - // Partition phase - 8-byte tuple reads, cache-friendly streaming - size_t batch = (row_count + num_threads - 1) / num_threads; - Contest::platform::worker_pool().execute([&, partition_bits](size_t t) { - size_t start = t * batch; - size_t end = std::min(start + batch, row_count); - if (start >= end) - return; - const int shift = 64 - partition_bits; - for (size_t i = start; i < end; ++i) { - mema::KeyRowPair pair = tuples[i]; - uint64_t h = hash_key(pair.key); - size_t p = (partition_bits == 0) ? 0 : (h >> shift); - // Direct use of tuple - no conversion needed - thread_parts[t][p].append(allocators[t], - {pair.key, pair.row_id}); - } - }); + // Page-based partition phase - each thread processes whole pages + constexpr size_t PAIRS_PER_PAGE = mema::key_row_column_t::PAIRS_PER_PAGE; + const size_t num_pages = tuples.pages.size(); + + Contest::platform::worker_pool().execute( + [&, partition_bits, pool_threads](size_t t) { + const int shift = 64 - partition_bits; + const size_t stride = static_cast(pool_threads); + for (size_t pg = t; pg < num_pages; pg += stride) { + // Prefetch next page + if (pg + stride < num_pages) { + __builtin_prefetch(tuples.pages[pg + stride]->data, 0, 3); + } + const auto *page_data = tuples.pages[pg]->data; + size_t base = pg * PAIRS_PER_PAGE; + size_t count = std::min(PAIRS_PER_PAGE, row_count - base); + for (size_t i = 0; i < count; ++i) { + const auto &pair = page_data[i]; + uint64_t h = hash_key(pair.key); + size_t p = (partition_bits == 0) ? 
0 : (h >> shift); + thread_parts[t][p].append(allocators[t], + {pair.key, pair.row_id}); + } + } + }); // Compute global offsets from per-thread counts Contest::platform::ArenaVector global_offsets(*arena_); @@ -448,7 +489,7 @@ class UnchainedHashtable { std::memset(global_offsets.data(), 0, (num_partitions + 1) * sizeof(size_t)); for (size_t p = 0; p < num_partitions; ++p) { - for (size_t t = 0; t < num_threads; ++t) { + for (int t = 0; t < pool_threads; ++t) { global_offsets[p + 1] += thread_parts[t][p].total_count; } global_offsets[p + 1] += global_offsets[p]; @@ -457,16 +498,16 @@ class UnchainedHashtable { size_t total = global_offsets[num_partitions]; if (total == 0) return; - keys_.resize(total); - row_ids_.resize(total); + entries_.resize(total); // Build partitions in parallel - const int nt = num_threads; - Contest::platform::worker_pool().execute([&, nt](size_t t) { - for (size_t p = t; p < num_partitions; p += nt) { - build_partition( - thread_parts, p, slots_per_partition, global_offsets[p], - global_offsets[p + 1] - global_offsets[p], nt, t); + Contest::platform::worker_pool().execute([&, pool_threads](size_t t) { + for (size_t p = t; p < num_partitions; + p += static_cast(pool_threads)) { + build_partition(thread_parts, p, slots_per_partition, + global_offsets[p], + global_offsets[p + 1] - global_offsets[p], + pool_threads, t); } }); } @@ -570,8 +611,7 @@ class UnchainedHashtable { size_t total = global_offsets[num_partitions]; if (total == 0) return; - keys_.resize(total); - row_ids_.resize(total); + entries_.resize(total); const int nt = num_threads; Contest::platform::worker_pool().execute([&, nt](size_t t) { From 68ee4082891431d8a0e5290f507e38b9d0956465 Mon Sep 17 00:00:00 2001 From: themosgit Date: Sat, 24 Jan 2026 01:08:26 +0200 Subject: [PATCH 10/13] WIP --- CMakeLists.txt | 2 +- include/platform/hardware.h | 4 +-- src/execute.cpp | 71 +++++++++++++++++++++++++++---------- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git 
a/CMakeLists.txt b/CMakeLists.txt index fc63a09..76039bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,7 @@ set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") add_compile_options(-O3 -mcpu=apple-m1 -flto) else() - add_compile_options(-O3 -march=native -m64 -mcrc32 -fpermissive -flto) + add_compile_options(-O3 -march=skylake -m64 -mcrc32 -fpermissive -flto) endif() add_link_options(-flto) diff --git a/include/platform/hardware.h b/include/platform/hardware.h index 83ef443..0cbb011 100644 --- a/include/platform/hardware.h +++ b/include/platform/hardware.h @@ -10,8 +10,8 @@ */ #pragma once -#define SPC__CORE_COUNT 8 -#define SPC__THREAD_COUNT 16 +#define SPC__CORE_COUNT 6 +#define SPC__THREAD_COUNT 6 #define SPC__LEVEL1_DCACHE_SIZE 32768 #define SPC__LEVEL2_CACHE_SIZE 1048576 #define SPC__LEVEL3_CACHE_SIZE 33554432 diff --git a/src/execute.cpp b/src/execute.cpp index 1cb0e10..ab935d5 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -16,7 +16,13 @@ * * @see plan.h, match_collector.h, materialize.h, construct_intermediate.h */ +#include #include +#include +#include +#include +#include +#include #if defined(__APPLE__) && defined(__aarch64__) #include #elif defined(SPC__USE_BENCHMARKVM_HARDWARE) @@ -45,7 +51,6 @@ namespace Contest { using namespace join; -using materialize::construct_intermediate_from_buffers; using materialize::construct_intermediate_with_tuples; using materialize::create_empty_intermediate_result; using materialize::materialize_from_buffers; @@ -212,26 +217,15 @@ JoinResult execute_join_with_mode( auto inter_start = std::chrono::high_resolution_clock::now(); IntermediateResult result; if (total_matches > 0) { - // Prepare page indices for intermediate construction - // Pass parent_join_key_idx so the key column is prepared for tuple - // population materialize::prepare_intermediate_columns( columnar_reader, build_input, probe_input, join_node, config.remapped_attrs, 
build_input.output_size(), config.build_left, join_node.parent_join_key_idx); - // Use tuple-based construction if parent needs a join key - if (join_node.parent_join_key_idx.has_value()) { - construct_intermediate_with_tuples( - match_buffers, build_input, probe_input, join_node, config, - config.build_left, *join_node.parent_join_key_idx, - columnar_reader, result, plan); - } else { - construct_intermediate_from_buffers( - match_buffers, build_input, probe_input, join_node, - config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, result, plan); - } + construct_intermediate_with_tuples( + match_buffers, build_input, probe_input, join_node, config, + config.build_left, *join_node.parent_join_key_idx, + columnar_reader, result, plan); } else { result = create_empty_intermediate_result(join_node); } @@ -356,6 +350,43 @@ JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, return IntermediateResult{}; } +/** + * + * @brief Prints the plan tree with metadata. + * + * @param the query plan itself. + * @param queue that should contain the root node. 
+ * + **/ +static std::function>)> +print_plan = [](const Plan& plan, std::queue> q) { + if (q.empty()) return; + int initial_size = q.size(); + for (int i = 0; i < initial_size; i++) { + auto [parent_idx, node_idx] = q.front(); + q.pop(); + const auto& node = plan.nodes[node_idx]; + std::cout << "parent: " << parent_idx << ", node: "<< node_idx << " size: " + << node.output_attrs.size() << " pairs: { "; + for (int i = 0; i < node.output_attrs.size(); i++) { + auto [col, type] = node.output_attrs[i]; + if (DataType::INT32 == type) + std::cout << "(" << col << ", INT32)-"; + else + std::cout << "(" << col << ", STR)-"; + } + if (const auto* join = std::get_if(&node.data)) { + std::cout << "left_key: " << join->left_attr; + std::cout << " right_key: " << join->right_attr; + q.emplace(node_idx ,join->left); + q.emplace(node_idx, join->right); + } + std::cout << "}\n"; + } + std::cout << std::endl << std::endl << std::endl << std::endl ; + print_plan(plan, std::move(q)); +}; + /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. 
@@ -381,9 +412,13 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, std::chrono::duration_cast(analyze_end - analyze_start) .count(); - + /* auto result = execute_impl(analyzed_plan, plan.root, true, stats); ColumnarTable final_result = std::get(std::move(result)); + */ + std::queue> q; + q.emplace(0, plan.root); + print_plan(plan, q); auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast( @@ -417,7 +452,7 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, *stats_out = stats; } - return std::move(final_result); + return ColumnarTable(); } void *build_context() { return nullptr; } From 83a1aea639e1ef42b9533b313c1938b68176e0da Mon Sep 17 00:00:00 2001 From: themosgit Date: Sat, 24 Jan 2026 01:10:31 +0200 Subject: [PATCH 11/13] chore: cleanup --- CMakeLists.txt | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 76039bc..53ed07b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,33 +59,6 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|powerpc|ppc64|ppc64le") set(SKIP_EXTENSIONS jemalloc) endif() -# Detect Xeon E5-2680 v3 CPU for benchmark VM hardware configuration -# Requires both: correct CPU AND at least 32GB RAM (benchmark VM has 64GB, CI has 4GB) -set(IS_BENCHMARK_VM_HARDWARE OFF) -if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND EXISTS "/proc/cpuinfo") - file(READ "/proc/cpuinfo" CPUINFO_CONTENT) - if(CPUINFO_CONTENT MATCHES "E5-2680 v3") - # Check available memory to distinguish benchmark VM from CI VM - if(EXISTS "/proc/meminfo") - file(READ "/proc/meminfo" MEMINFO_CONTENT) - string(REGEX MATCH "MemTotal:[ \t]+([0-9]+)" MEM_MATCH "${MEMINFO_CONTENT}") - if(MEM_MATCH) - set(MEM_TOTAL_KB "${CMAKE_MATCH_1}") - math(EXPR MEM_TOTAL_GB "${MEM_TOTAL_KB} / 1024 / 1024") - if(MEM_TOTAL_GB GREATER_EQUAL 32) - message(STATUS "Detected Intel Xeon E5-2680 v3 CPU with ${MEM_TOTAL_GB}GB RAM - using 
benchmark VM hardware configuration") - add_compile_definitions(SPC__USE_BENCHMARKVM_HARDWARE) - set(IS_BENCHMARK_VM_HARDWARE ON) - else() - message(STATUS "Detected Intel Xeon E5-2680 v3 CPU but only ${MEM_TOTAL_GB}GB RAM (need >=32GB) - using generic hardware configuration") - endif() - endif() - endif() - endif() -endif() - -# Include all sources from /src directory. CONFIGURE_DEPENDS can be unreliable. -# Try re-running cmake in case changes are not recognized. file(GLOB ALL_SRC CONFIGURE_DEPENDS "src/*.cpp" @@ -110,13 +83,6 @@ if(NOT CMAKE_SYSTEM_NAME STREQUAL "Windows") target_compile_definitions(faster PRIVATE) target_link_libraries(faster PRIVATE re2 fmt range-v3 nlohmann_json::nlohmann_json sqlparser) target_include_directories(faster PRIVATE include) - - if(IS_BENCHMARK_VM_HARDWARE) - add_executable(leaderboard ${MANOLATES_SRC} tests/read_sql.cpp) - target_compile_definitions(leaderboard PRIVATE) - target_link_libraries(leaderboard PRIVATE re2 fmt range-v3 nlohmann_json::nlohmann_json sqlparser) - target_include_directories(leaderboard PRIVATE include) - endif() endif() if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") From 4b644e716f43ba3c68409268b0a8f8d56885e7b1 Mon Sep 17 00:00:00 2001 From: themosgit Date: Sat, 24 Jan 2026 15:22:27 +0200 Subject: [PATCH 12/13] fix: migrating workflow --- .github/workflows/benchmark.yml | 29 -------------------- .github/workflows/ci.yml | 31 ++++++++++++++++----- .github/workflows/notifier.yaml | 48 --------------------------------- .gitignore | 2 ++ flake.nix | 11 ++++---- 5 files changed, 32 insertions(+), 89 deletions(-) delete mode 100644 .github/workflows/benchmark.yml delete mode 100644 .github/workflows/notifier.yaml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml deleted file mode 100644 index d854df3..0000000 --- a/.github/workflows/benchmark.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Workflow for leaderboard submission - -on: - # push: - # branches: [ main ] - # pull_request: - # 
branches: [ main ] - workflow_dispatch: -jobs: - leaderboard: - name: leaderboard - runs-on: - group: benchmark - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Configure CMake - run: | - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -Wno-dev - - - name: Build project - run: | - cmake --build build -- -j$(nproc) leaderboard - - - name: Run for leaderboard - run: | - leaderboard.sh - diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9ccf48b..643c1de 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,27 +4,44 @@ on: push: branches: - main + - opt pull_request: branches: - main - - workflow_dispatch: - jobs: build_and_test: - runs-on: - group: k23a + runs-on: self-hosted + + env: + CCACHE_DIR: ${{ github.workspace }}/.ccache steps: - name: Checkout repository code uses: actions/checkout@v4 + + - name: Setup cache + uses: actions/cache@v4 + with: + path: .ccache + key: ${{ runner.os }}-ccache-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-ccache- - name: Configure initial build with CMake run: | - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -Wno-dev + nix develop -c \ + cmake -S . 
-B build -Wno-dev \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - name: Build all targets - run: cmake --build build -- -j $(nproc) unit_tests + run: | + nix develop -c \ + cmake --build build -- -j $(nproc) unit_tests - name: Run unit tests run: ./build/unit_tests + + - name: Cache stats + run: nix develop -c ccache -s diff --git a/.github/workflows/notifier.yaml b/.github/workflows/notifier.yaml deleted file mode 100644 index 5f9b68b..0000000 --- a/.github/workflows/notifier.yaml +++ /dev/null @@ -1,48 +0,0 @@ -name: GitHub Push Notifier - -on: - push: - branches: - - '**' - -jobs: - notify: - runs-on: - group: k23a - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Node.js - uses: actions/setup-node@v3 - with: - node-version: '18' - - - name: Install dependencies - run: npm install discord.js node-fetch dotenv - - - name: Send Discord notification - env: - DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} - CHANNEL_ID: ${{ secrets.CHANNEL_ID }} - GITHUB_TOKEN: ${{ vars.GH_TOKEN }} - GITHUB_OWNER: ${{ github.repository_owner }} - GITHUB_REPO: ${{ github.event.repository.name }} - run: | - node -e " - require('dotenv').config(); - const { Client, GatewayIntentBits } = require('discord.js'); - const client = new Client({ intents: [GatewayIntentBits.Guilds] }); - - client.once('ready', async () => { - const channel = client.channels.cache.get(process.env.CHANNEL_ID); - const message = \`🚨 New Push to \\\`${{ github.repository }}\\\` Branch \\\`${{ github.ref_name }}\\\`!\nšŸ‘¤ Author: \\\`${{ github.event.pusher.name }}\\\`\nšŸ“ Commit: \\\`${{ github.event.head_commit.message }}\\\`\nšŸ”— View: ${{ github.event.head_commit.url }}\`; - await channel.send(message); - process.exit(0); - }); - - client.login(process.env.DISCORD_TOKEN); - " \ No newline at end of file diff --git a/.gitignore b/.gitignore index f1c8719..c21caa5 100644 --- a/.gitignore 
+++ b/.gitignore @@ -12,6 +12,8 @@ /docs/html /docs/xml .clangd +.cache +.ccache compile_commands.json /env/ script.py diff --git a/flake.nix b/flake.nix index cd6af67..1f459a3 100644 --- a/flake.nix +++ b/flake.nix @@ -23,19 +23,19 @@ buildInputs = with pkgs; [ llvmPackages.libcxxClang llvmPackages.libllvm + ccache doxygen curl git cmake typst ] ++ lib.optionals (system == "x86_64-linux") [ - linuxPackages.perf + perf gef ]; shellHook = '' CLANGD_FILE=".clangd" CPP_STANDARD="c++20" - echo "Generating $CLANGD_FILE from \$ clang++ -v output..." INCLUDE_PATHS=$( @@ -57,9 +57,10 @@ echo " - -I$CLEAN_PATH" >> $CLANGD_FILE done <<< "$INCLUDE_PATHS" - echo " - -O2" >> $CLANGD_FILE - - echo "Generation of $CLANGD_FILE complete." + echo "exporting ccache paths..." + export CCACHE_DIR="$PWD/.ccache" + export PATH="${pkgs.ccache}/bin:$PATH" + echo "done." if command -v fish &> /dev/null; then exec fish From b3355047d1062ac8601082c285820394cf07697b Mon Sep 17 00:00:00 2001 From: themosgit Date: Sat, 24 Jan 2026 22:18:40 +0200 Subject: [PATCH 13/13] WIP --- .github/workflows/ci.yml | 1 - CMakeLists.txt | 4 +- include/data_model/intermediate.h | 4 +- src/execute.cpp | 63 +++++++++++++++++++++---------- 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 643c1de..b66ff8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - opt pull_request: branches: - main diff --git a/CMakeLists.txt b/CMakeLists.txt index 53ed07b..ab5f3ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,7 +71,7 @@ set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") add_compile_options(-O3 -mcpu=apple-m1 -flto) else() - add_compile_options(-O3 -march=skylake -m64 -mcrc32 -fpermissive -flto) + add_compile_options(-O0 -march=skylake -m64 -mcrc32 -fpermissive) endif() add_link_options(-flto) @@ -88,7 +88,7 @@ endif() if 
(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") target_compile_options(unit_tests PRIVATE -O3 -mcpu=apple-m1 -flto) else() - target_compile_options(unit_tests PRIVATE -O3 -march=native -m64 -fpermissive -flto) + target_compile_options(unit_tests PRIVATE -O0 -march=skylake -m64 -fpermissive) endif() target_compile_definitions(unit_tests PRIVATE) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 0b723d0..e8cefe7 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -55,6 +55,7 @@ struct alignas(8) KeyRowPair { }; /** + * * @brief Column of (key, row_id) tuples for join key storage. * * Enables accelerated hashtable build (tuples match internal format) and @@ -62,7 +63,8 @@ struct alignas(8) KeyRowPair { * separate column_t for join key columns. * * Memory layout: 16KB pages containing 2048 KeyRowPair entries each. - */ + * + **/ struct key_row_column_t { static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB static constexpr size_t PAIRS_PER_PAGE = diff --git a/src/execute.cpp b/src/execute.cpp index ab935d5..ce81a31 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -16,6 +16,7 @@ * * @see plan.h, match_collector.h, materialize.h, construct_intermediate.h */ +#include "data_model/plan.h" #include #include #include @@ -350,6 +351,7 @@ JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, return IntermediateResult{}; } + /** * * @brief Prints the plan tree with metadata. @@ -358,35 +360,58 @@ JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, * @param queue that should contain the root node. 
* **/ -static std::function>)> -print_plan = [](const Plan& plan, std::queue> q) { +static std::function>, int)> +print_plan = [](const Plan& plan, std::queue> q, int table_id) { if (q.empty()) return; int initial_size = q.size(); for (int i = 0; i < initial_size; i++) { - auto [parent_idx, node_idx] = q.front(); + auto [node_idx, parent_attr] = q.front(); q.pop(); const auto& node = plan.nodes[node_idx]; - std::cout << "parent: " << parent_idx << ", node: "<< node_idx << " size: " - << node.output_attrs.size() << " pairs: { "; + if (std::holds_alternative(node.data)) { + continue; + } + const auto data = std::get(node.data); + + std::cout << " node: "<< node_idx << " size: " + << node.output_attrs.size() << std::endl; + + bool match_left = false; + bool match_right = false; for (int i = 0; i < node.output_attrs.size(); i++) { auto [col, type] = node.output_attrs[i]; - if (DataType::INT32 == type) - std::cout << "(" << col << ", INT32)-"; - else - std::cout << "(" << col << ", STR)-"; - } - if (const auto* join = std::get_if(&node.data)) { - std::cout << "left_key: " << join->left_attr; - std::cout << " right_key: " << join->right_attr; - q.emplace(node_idx ,join->left); - q.emplace(node_idx, join->right); + if (node_idx != plan.root) { + if (i == parent_attr) std::cout << "build->"; + else std::cout << "defer->"; + } + if (col < plan.nodes[data.left].output_attrs.size()) { + std::cout << "left->"; + match_left = true; + } else { + std::cout << "right->"; + match_right = true; + } + if (DataType::INT32 == type) std::cout << "(" << col << ", INT32)"; + else std::cout << "(" << col << ", STR)"; + std::cout << std::endl; + } - std::cout << "}\n"; + std::cout << "===="; + if (match_left && match_right) std::cout << "Match both"; + else if (match_left) std::cout << "Match left"; + else std::cout << "Match right"; + std::cout << "====" << std::endl; + + std::cout << "left_key: " << data.left_attr << " left child: " << data.left; + std::cout << "\nright_key: " << 
data.right_attr << " right child: " << data.right; + q.emplace(data.left, data.left_attr); + q.emplace(data.right, data.right_attr); + std::cout << "\n\n\n\n\n"; } - std::cout << std::endl << std::endl << std::endl << std::endl ; print_plan(plan, std::move(q)); }; + /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. @@ -416,8 +441,8 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, auto result = execute_impl(analyzed_plan, plan.root, true, stats); ColumnarTable final_result = std::get(std::move(result)); */ - std::queue> q; - q.emplace(0, plan.root); + std::queue> q; + q.emplace(plan.root, 0); print_plan(plan, q); auto total_end = std::chrono::high_resolution_clock::now();