diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml deleted file mode 100644 index d854df3..0000000 --- a/.github/workflows/benchmark.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Workflow for leaderboard submission - -on: - # push: - # branches: [ main ] - # pull_request: - # branches: [ main ] - workflow_dispatch: -jobs: - leaderboard: - name: leaderboard - runs-on: - group: benchmark - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Configure CMake - run: | - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -Wno-dev - - - name: Build project - run: | - cmake --build build -- -j$(nproc) leaderboard - - - name: Run for leaderboard - run: | - leaderboard.sh - diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9ccf48b..b66ff8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,24 +7,40 @@ on: pull_request: branches: - main - - workflow_dispatch: - jobs: build_and_test: - runs-on: - group: k23a + runs-on: self-hosted + + env: + CCACHE_DIR: ${{ github.workspace }}/.ccache steps: - name: Checkout repository code uses: actions/checkout@v4 + + - name: Setup cache + uses: actions/cache@v4 + with: + path: .ccache + key: ${{ runner.os }}-ccache-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-ccache- - name: Configure initial build with CMake run: | - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -Wno-dev + nix develop -c \ + cmake -S . 
-B build -Wno-dev \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - name: Build all targets - run: cmake --build build -- -j $(nproc) unit_tests + run: | + nix develop -c \ + cmake --build build -- -j $(nproc) unit_tests - name: Run unit tests run: ./build/unit_tests + + - name: Cache stats + run: nix develop -c ccache -s diff --git a/.github/workflows/notifier.yaml b/.github/workflows/notifier.yaml deleted file mode 100644 index 5f9b68b..0000000 --- a/.github/workflows/notifier.yaml +++ /dev/null @@ -1,48 +0,0 @@ -name: GitHub Push Notifier - -on: - push: - branches: - - '**' - -jobs: - notify: - runs-on: - group: k23a - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Node.js - uses: actions/setup-node@v3 - with: - node-version: '18' - - - name: Install dependencies - run: npm install discord.js node-fetch dotenv - - - name: Send Discord notification - env: - DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} - CHANNEL_ID: ${{ secrets.CHANNEL_ID }} - GITHUB_TOKEN: ${{ vars.GH_TOKEN }} - GITHUB_OWNER: ${{ github.repository_owner }} - GITHUB_REPO: ${{ github.event.repository.name }} - run: | - node -e " - require('dotenv').config(); - const { Client, GatewayIntentBits } = require('discord.js'); - const client = new Client({ intents: [GatewayIntentBits.Guilds] }); - - client.once('ready', async () => { - const channel = client.channels.cache.get(process.env.CHANNEL_ID); - const message = \`🚨 New Push to \\\`${{ github.repository }}\\\` Branch \\\`${{ github.ref_name }}\\\`!\nšŸ‘¤ Author: \\\`${{ github.event.pusher.name }}\\\`\nšŸ“ Commit: \\\`${{ github.event.head_commit.message }}\\\`\nšŸ”— View: ${{ github.event.head_commit.url }}\`; - await channel.send(message); - process.exit(0); - }); - - client.login(process.env.DISCORD_TOKEN); - " \ No newline at end of file diff --git a/.gitignore b/.gitignore index 12c22e9..c21caa5 100644 --- a/.gitignore 
+++ b/.gitignore @@ -12,7 +12,12 @@ /docs/html /docs/xml .clangd +.cache +.ccache compile_commands.json /env/ script.py *.md +/build_deferred +/build_debug +/build_eager diff --git a/CMakeLists.txt b/CMakeLists.txt index dc0739a..ab5f3ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,40 +54,11 @@ FetchContent_Declare( FetchContent_MakeAvailable(fmtlib) -set(ENABLE_SANITIZER OFF) -set(ENABLE_UBSAN OFF) if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|powerpc|ppc64|ppc64le") message("Disabling jemalloc extension of DuckDB on Power.") set(SKIP_EXTENSIONS jemalloc) endif() -# Detect Xeon E5-2680 v3 CPU for benchmark VM hardware configuration -# Requires both: correct CPU AND at least 32GB RAM (benchmark VM has 64GB, CI has 4GB) -set(IS_BENCHMARK_VM_HARDWARE OFF) -if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND EXISTS "/proc/cpuinfo") - file(READ "/proc/cpuinfo" CPUINFO_CONTENT) - if(CPUINFO_CONTENT MATCHES "E5-2680 v3") - # Check available memory to distinguish benchmark VM from CI VM - if(EXISTS "/proc/meminfo") - file(READ "/proc/meminfo" MEMINFO_CONTENT) - string(REGEX MATCH "MemTotal:[ \t]+([0-9]+)" MEM_MATCH "${MEMINFO_CONTENT}") - if(MEM_MATCH) - set(MEM_TOTAL_KB "${CMAKE_MATCH_1}") - math(EXPR MEM_TOTAL_GB "${MEM_TOTAL_KB} / 1024 / 1024") - if(MEM_TOTAL_GB GREATER_EQUAL 32) - message(STATUS "Detected Intel Xeon E5-2680 v3 CPU with ${MEM_TOTAL_GB}GB RAM - using benchmark VM hardware configuration") - add_compile_definitions(SPC__USE_BENCHMARKVM_HARDWARE) - set(IS_BENCHMARK_VM_HARDWARE ON) - else() - message(STATUS "Detected Intel Xeon E5-2680 v3 CPU but only ${MEM_TOTAL_GB}GB RAM (need >=32GB) - using generic hardware configuration") - endif() - endif() - endif() - endif() -endif() - -# Include all sources from /src directory. CONFIGURE_DEPENDS can be unreliable. -# Try re-running cmake in case changes are not recognized. 
file(GLOB ALL_SRC CONFIGURE_DEPENDS "src/*.cpp" @@ -100,7 +71,7 @@ set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") add_compile_options(-O3 -mcpu=apple-m1 -flto) else() - add_compile_options(-O3 -march=native -m64 -mcrc32 -fpermissive -flto) + add_compile_options(-O0 -march=skylake -m64 -mcrc32 -fpermissive) endif() add_link_options(-flto) @@ -112,19 +83,12 @@ if(NOT CMAKE_SYSTEM_NAME STREQUAL "Windows") target_compile_definitions(faster PRIVATE) target_link_libraries(faster PRIVATE re2 fmt range-v3 nlohmann_json::nlohmann_json sqlparser) target_include_directories(faster PRIVATE include) - - if(IS_BENCHMARK_VM_HARDWARE) - add_executable(leaderboard ${MANOLATES_SRC} tests/read_sql.cpp) - target_compile_definitions(leaderboard PRIVATE) - target_link_libraries(leaderboard PRIVATE re2 fmt range-v3 nlohmann_json::nlohmann_json sqlparser) - target_include_directories(leaderboard PRIVATE include) - endif() endif() if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") target_compile_options(unit_tests PRIVATE -O3 -mcpu=apple-m1 -flto) else() - target_compile_options(unit_tests PRIVATE -O3 -march=native -m64 -fpermissive -flto) + target_compile_options(unit_tests PRIVATE -O0 -march=skylake -m64 -fpermissive) endif() target_compile_definitions(unit_tests PRIVATE) diff --git a/flake.nix b/flake.nix index cd6af67..1f459a3 100644 --- a/flake.nix +++ b/flake.nix @@ -23,19 +23,19 @@ buildInputs = with pkgs; [ llvmPackages.libcxxClang llvmPackages.libllvm + ccache doxygen curl git cmake typst ] ++ lib.optionals (system == "x86_64-linux") [ - linuxPackages.perf + perf gef ]; shellHook = '' CLANGD_FILE=".clangd" CPP_STANDARD="c++20" - echo "Generating $CLANGD_FILE from \$ clang++ -v output..." INCLUDE_PATHS=$( @@ -57,9 +57,10 @@ echo " - -I$CLEAN_PATH" >> $CLANGD_FILE done <<< "$INCLUDE_PATHS" - echo " - -O2" >> $CLANGD_FILE - - echo "Generation of $CLANGD_FILE complete." + echo "exporting ccache paths..." 
+ export CCACHE_DIR="$PWD/.ccache" + export PATH="${pkgs.ccache}/bin:$PATH" + echo "done." if command -v fish &> /dev/null; then exec fish diff --git a/include/data_access/columnar_reader.h b/include/data_access/columnar_reader.h index 2074498..39348d0 100644 --- a/include/data_access/columnar_reader.h +++ b/include/data_access/columnar_reader.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -266,6 +267,127 @@ class ColumnarReader { global_probe_version.fetch_add(1, std::memory_order_relaxed); } + // ======================================================================== + // Base Table Page Index Methods (for O(1) deferred column resolution) + // ======================================================================== + + /** @brief Reset base table prepared flags for new query. */ + inline void reset_base_tables() { + base_table_prepared_.fill(false); + base_table_version_++; + } + + /** + * @brief Prepare page index for a base table column. + * + * Called once per unique (table_id, col_idx) before deferred resolution. + * Enables O(log P) page lookup instead of O(P) linear scan per read. + * + * @param table_id Base table ID (0-15). + * @param col_idx Column index within base table (0-15). + * @param column The Column to build page index for. + */ + inline void prepare_base_column(uint8_t table_id, uint8_t col_idx, + const Column &column) { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + if (idx >= MAX_BASE_TABLE_INDICES) + return; + + if (!base_table_prepared_[idx]) { + auto &arena = platform::get_arena(0); + base_table_indices_[idx] = PageIndex(arena); + base_table_indices_[idx].build(column); + base_table_prepared_[idx] = true; + } + } + + /** @brief Check if base column page index is prepared. 
*/ + inline bool is_base_column_prepared(uint8_t table_id, + uint8_t col_idx) const { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + return idx < MAX_BASE_TABLE_INDICES && base_table_prepared_[idx]; + } + + /** + * @brief Read value from base table using prepared page index. + * + * O(1) with cursor caching for sequential access, O(log P) on cache miss. + * Falls back to O(P) linear scan if page index not prepared. + * + * @param column The base table column. + * @param table_id Base table ID. + * @param col_idx Column index within base table. + * @param row_id Row ID within the column. + * @param data_type Data type of the column. + * @param cursor Thread-local cursor for caching. + * @return The value at the specified row. + */ + inline mema::value_t read_base_table_value(const Column &column, + uint8_t table_id, + uint8_t col_idx, uint32_t row_id, + DataType data_type, + Cursor &cursor) const { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + + if (idx >= MAX_BASE_TABLE_INDICES || !base_table_prepared_[idx]) { + // Fallback to O(P) linear scan + return read_value_direct(column, row_id, data_type); + } + + const PageIndex &page_index = base_table_indices_[idx]; + + // Dense INT32 fast path: O(1) arithmetic lookup + if (data_type == DataType::INT32 && page_index.is_dense_int32) { + return mema::value_t{read_dense_int32(page_index, row_id)}; + } + + // Check cursor cache (version uses base_table_version_ + idx for + // uniqueness) + uint64_t effective_version = base_table_version_ + idx; + bool cache_hit = + cursor.version == effective_version && cursor.cached_col == idx && + row_id >= cursor.cached_start && row_id < cursor.cached_end; + + if (!cache_hit) { + // Check sequential access optimization + if (cursor.version == effective_version && + cursor.cached_col == idx && row_id == cursor.cached_end) { + size_t next_page = cursor.cached_page + 1; + if (next_page < 
page_index.cumulative_rows.size()) { + load_page_into_cursor_base(column, page_index, next_page, + idx, effective_version, cursor); + } else { + // Past end of data + return mema::value_t{mema::value_t::NULL_VALUE}; + } + } else { + // Binary search for page + size_t page_num = page_index.find_page(row_id); + load_page_into_cursor_base(column, page_index, page_num, idx, + effective_version, cursor); + } + } + + // Now cursor is loaded for the correct page + uint32_t local_row = row_id - cursor.cached_start; + if (SPC_LIKELY(cursor.is_dense)) { + if (data_type == DataType::INT32) { + return mema::value_t{cursor.data_ptr[local_row]}; + } else { + return mema::value_t::encode_string( + cursor.page_idx_val, static_cast(local_row)); + } + } + if (SPC_UNLIKELY(cursor.is_special)) { + return mema::value_t::encode_string( + cursor.page_idx_val, mema::value_t::LONG_STRING_OFFSET); + } + return read_sparse(local_row, data_type, cursor); + } + /** @brief Fast path: check cursor cache, dispatch to appropriate handler. */ template @@ -275,10 +397,15 @@ class ColumnarReader { /* Dense INT32 fast path: O(1) arithmetic lookup, bypasses cursor */ if (data_type == DataType::INT32) { - const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] + size_t pidx_size = + IsBuild ? build_page_indices.size() : probe_page_indices.size(); + if (SPC_LIKELY(col_idx < pidx_size)) { + const PageIndex &page_index = IsBuild + ? 
build_page_indices[col_idx] : probe_page_indices[col_idx]; - if (SPC_LIKELY(page_index.is_dense_int32)) { - return mema::value_t{read_dense_int32(page_index, row_id)}; + if (SPC_LIKELY(page_index.is_dense_int32)) { + return mema::value_t{read_dense_int32(page_index, row_id)}; + } } } @@ -291,10 +418,10 @@ class ColumnarReader { global_probe_version.load(std::memory_order_relaxed); } - if (SPC_LIKELY(cursor.version == current_version && - col_idx == cursor.cached_col && - row_id >= cursor.cached_start && - row_id < cursor.cached_end)) { + bool cache_hit = + cursor.version == current_version && col_idx == cursor.cached_col && + row_id >= cursor.cached_start && row_id < cursor.cached_end; + if (SPC_LIKELY(cache_hit)) { uint32_t local_row = row_id - cursor.cached_start; if (SPC_LIKELY(cursor.is_dense)) { if (data_type == DataType::INT32) { @@ -313,9 +440,11 @@ class ColumnarReader { } /* sequential access optimization: skip binary search for next page */ + size_t pidx_count = + IsBuild ? build_page_indices.size() : probe_page_indices.size(); if (SPC_LIKELY(cursor.version == current_version && col_idx == cursor.cached_col && - row_id == cursor.cached_end)) { + row_id == cursor.cached_end && col_idx < pidx_count)) { const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] : probe_page_indices[col_idx]; size_t next_page = cursor.cached_page + 1; @@ -384,6 +513,12 @@ class ColumnarReader { Cursor &cursor, uint64_t current_version) const { + size_t pidx_size = + IsBuild ? build_page_indices.size() : probe_page_indices.size(); + if (SPC_UNLIKELY(col_idx >= pidx_size)) { + // No page index prepared - use direct page read + return read_value_direct(column, row_id, data_type); + } const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] : probe_page_indices[col_idx]; size_t page_num = page_index.find_page(row_id); @@ -407,6 +542,18 @@ class ColumnarReader { } } + /** + * @brief Direct value read bypassing page index cache. 
+ * + * Used for deferred column resolution when reading from base tables + * that don't have prepared page indices. O(n) page scan per read. + */ + inline mema::value_t read_value_direct_public(const Column &column, + uint32_t row_id, + DataType data_type) const { + return read_value_direct(column, row_id, data_type); + } + inline const PageIndex &get_build_page_index(size_t col_idx) const { return build_page_indices[col_idx]; } @@ -428,6 +575,89 @@ class ColumnarReader { return reinterpret_cast(page_data + 4)[local_row]; } + /** + * @brief Direct value read without prepared page index. + * + * Used when page indices aren't available (e.g., reading base tables + * during deferred resolution). O(n) page scan - slower than cached path. + */ + inline mema::value_t read_value_direct(const Column &column, + uint32_t row_id, + DataType data_type) const { + // Linear scan to find page containing row_id + uint32_t cumulative = 0; + for (size_t page_num = 0; page_num < column.pages.size(); ++page_num) { + auto *page_data = column.pages[page_num]->data; + auto num_rows = *reinterpret_cast(page_data); + auto num_values = + *reinterpret_cast(page_data + 2); + + // Handle special pages + if (num_rows == 0xffff) { + // Long string page - single row + if (row_id == cumulative) { + return mema::value_t::encode_string( + static_cast(page_num), + mema::value_t::LONG_STRING_OFFSET); + } + cumulative += 1; + continue; + } + if (num_rows == 0xfffe) { + // Skip special marker pages + continue; + } + + if (row_id < cumulative + num_rows) { + // Found the page + uint32_t local_row = row_id - cumulative; + bool is_dense = (num_rows == num_values); + const auto *data_ptr = + reinterpret_cast(page_data + 4); + + if (is_dense) { + if (data_type == DataType::INT32) { + return mema::value_t{data_ptr[local_row]}; + } else { + return mema::value_t::encode_string( + static_cast(page_num), + static_cast(local_row)); + } + } else { + // Sparse page - check bitmap + size_t bitmap_size = (num_rows + 
7) / 8; + const auto *bitmap_ptr = reinterpret_cast( + page_data + PAGE_SIZE - bitmap_size); + + bool is_valid = + bitmap_ptr[local_row >> 3] & (1u << (local_row & 7)); + if (!is_valid) { + return mema::value_t{mema::value_t::NULL_VALUE}; + } + + // Compute data index via popcount + uint32_t data_idx = 0; + for (uint32_t i = 0; i < local_row; ++i) { + if (bitmap_ptr[i >> 3] & (1u << (i & 7))) { + data_idx++; + } + } + + if (data_type == DataType::INT32) { + return mema::value_t{data_ptr[data_idx]}; + } else { + return mema::value_t::encode_string( + static_cast(page_num), + static_cast(data_idx)); + } + } + } + cumulative += num_rows; + } + // Row not found - return NULL + return mema::value_t{mema::value_t::NULL_VALUE}; + } + /** @brief Reads from sparse pages using bitmap and popcount. */ inline mema::value_t read_sparse(uint32_t local_row, DataType data_type, const Cursor &cursor) const { @@ -452,8 +682,48 @@ class ColumnarReader { static_cast(data_idx)); } } + + /** @brief Load page into cursor for base table access. 
*/ + inline void load_page_into_cursor_base(const Column &column, + const PageIndex &page_index, + size_t page_num, size_t col_idx, + uint64_t version, + Cursor &cursor) const { + cursor.version = version; + cursor.cached_col = col_idx; + cursor.cached_page = page_num; + cursor.cached_start = page_index.page_start_row(page_num); + cursor.cached_end = page_index.cumulative_rows[page_num]; + cursor.page_idx_val = static_cast(page_num); + cursor.col_all_dense = page_index.all_dense; + + auto *page_data = column.pages[page_num]->data; + auto num_rows = *reinterpret_cast(page_data); + auto num_values = *reinterpret_cast(page_data + 2); + + cursor.is_special = (num_rows == 0xffff); + cursor.is_dense = (num_rows == num_values); + cursor.data_ptr = reinterpret_cast(page_data + 4); + + if (!cursor.is_dense && !cursor.is_special) { + size_t bitmap_size = (num_rows + 7) / 8; + cursor.bitmap_ptr = reinterpret_cast( + page_data + PAGE_SIZE - bitmap_size); + cursor.prefix_sum_ptr = + page_index.page_prefix_sums[page_num].data(); + } + } + std::vector build_page_indices; std::vector probe_page_indices; + + // Base table page indices for deferred column resolution. + // Index = (table_id << 4) | col_idx, supports 16 tables Ɨ 16 cols = 256. + static constexpr size_t BASE_TABLE_SHIFT = 4; + static constexpr size_t MAX_BASE_TABLE_INDICES = 256; + std::array base_table_indices_; + std::array base_table_prepared_{}; + uint64_t base_table_version_ = 0; }; } // namespace Contest::io diff --git a/include/data_model/deferred_plan.h b/include/data_model/deferred_plan.h new file mode 100644 index 0000000..13be4dd --- /dev/null +++ b/include/data_model/deferred_plan.h @@ -0,0 +1,142 @@ +/** + * @file deferred_plan.h + * @brief Analyzed plan with materialization decisions for execution. + * + * AnalyzedPlan mirrors the original Plan structure but includes pre-computed + * decisions about which columns to materialize eagerly (join keys) vs defer + * until final output. 
Each AnalyzedJoinNode tracks column provenance back to + * base tables for efficient deferred resolution. + * + * @see analyze_plan.cpp for the analysis algorithm. + * @see intermediate.h for the runtime result format. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace Contest { + +/** + * @brief Materialization decision for an output column. + * + * MATERIALIZE: Column is needed as a join key by parent - materialize eagerly. + * DEFER: Column only needed at final output - defer until root materialization. + */ +enum class ColumnResolution : uint8_t { MATERIALIZE, DEFER }; + +/** + * @brief Tracks the base table origin of a column for deferred resolution. + * + * Used to resolve deferred columns at final materialization by looking up + * the original value in the base table using row ID provenance. + */ +struct ColumnProvenance { + uint8_t base_table_id; ///< Index into Plan::inputs. + uint8_t base_column_idx; ///< Column index within the base table. +}; + +/** + * @brief Complete metadata for an output column in a join. + * + * Combines materialization decision, provenance tracking, and child source + * information for efficient intermediate construction and final resolution. + */ +struct AnalyzedColumnInfo { + size_t original_idx; ///< Index in node's output_attrs. + DataType type; ///< INT32 or VARCHAR. + + ColumnResolution resolution; ///< MATERIALIZE or DEFER. + ColumnProvenance provenance; ///< Base table source for deferred resolution. + + bool from_left; ///< True if from left child, false if right. + size_t child_output_idx; ///< Index in child's output_attrs. +}; + +/** + * @brief Analyzed scan node for execution. + * + * Wraps a ScanNode with output attribute information. + */ +struct AnalyzedScanNode { + size_t node_idx; ///< Index in original Plan::nodes. + uint8_t base_table_id; ///< Index into Plan::inputs. + std::vector> output_attrs; ///< Projected cols. 
+}; + +/** + * @brief Analyzed join node with pre-computed materialization decisions. + * + * Contains all information needed for execution: + * - Which columns to materialize eagerly (join keys for parent) + * - Column provenance for deferred resolution + * - Pre-computed match collection mode + * - Number of deferred columns for allocation + */ +struct AnalyzedJoinNode { + size_t node_idx; ///< Index in original Plan::nodes. + + size_t left_child_idx; ///< Left child index in Plan::nodes. + size_t right_child_idx; ///< Right child index in Plan::nodes. + size_t left_join_attr; ///< Join key index in left child's output. + size_t right_join_attr; ///< Join key index in right child's output. + + /// Original output attributes (global indexing). + std::vector> output_attrs; + + /// Per-column materialization decisions and provenance. + std::vector columns; + + /// Pre-computed collection mode (assumes build=left; flip if build=right). + join::MatchCollectionMode base_collection_mode; + + /// Number of deferred columns (for pre-allocation). + size_t num_deferred_columns = 0; + + /// Column index that parent needs as join key (nullopt if root). + std::optional parent_join_key_idx; + + /// True if this is the root node. + bool is_root; +}; + +/** + * @brief Plan node variant for execution. + */ +using AnalyzedNode = std::variant; + +/** + * @brief Analyzed plan with materialization decisions. + * + * Mirrors Plan structure but includes pre-computed decisions for deferred + * materialization. The original_plan pointer provides access to base tables + * for value resolution. + */ +struct AnalyzedPlan { + std::vector nodes; ///< Analyzed nodes (same indices as Plan). + size_t root; ///< Root node index. + const Plan *original_plan; ///< Non-owning reference to original plan. + + const AnalyzedNode &operator[](size_t idx) const { return nodes[idx]; } +}; + +/** + * @brief Analyze plan and compute materialization decisions. 
+ * + * Walks the plan tree in post-order, determining for each join node: + * 1. Which column the parent needs as join key (MATERIALIZE) + * 2. All other columns (DEFER) + * 3. Provenance for each column back to base table + * 4. Pre-computed collection mode based on output columns + * + * @param plan Original query plan. + * @return AnalyzedPlan with materialization decisions. + */ +AnalyzedPlan analyze_plan(const Plan &plan); + +} // namespace Contest diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index e0e2667..e8cefe7 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -1,32 +1,174 @@ /** * @file intermediate.h - * @brief Intermediate join format: VARCHAR as page/offset refs (no string - * copy). + * @brief Intermediate join result types and input abstraction. * - * Base tables must outlive execution. @see plan.h ColumnarTable, - * construct_intermediate.h + * Provides: + * - mema::value_t: 4-byte value encoding (INT32 direct, VARCHAR as page/offset) + * - mema::column_t: 16KB-paged column for materialized values + * - mema::DeferredTable: 16KB-paged 32-bit row ID storage per base table + * - IntermediateResult: Lightweight result with selective materialization + * - JoinInput: Unified abstraction over columnar tables and intermediate + * results + * + * Base tables must outlive execution. + * + * @see plan.h for ColumnarTable, construct_intermediate.h for building results. + * @see deferred_plan.h for AnalyzedJoinNode with column decisions. */ #pragma once #include #include +#include #include #include +#include #include +#include #include /** * @namespace mema - * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages). + * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages) + + * DeferredTable (32-bit row IDs) + key_row_column_t (8B tuples). * * value_t: INT32 direct or VARCHAR page/offset ref. column_t: arena-allocated - * pages with write_at(). 
@see Contest::ExecuteResult, plan.h ColumnarTable. + * pages with write_at(). DeferredTable: 32-bit row ID storage per base table. + * key_row_column_t: (key, row_id) tuples for join key propagation. + * + * @see Contest::IntermediateResult, plan.h ColumnarTable. */ namespace mema { +/** + * @brief Join key with associated row ID for tuple-based storage. + * + * For LEFT_ONLY/RIGHT_ONLY modes: row_id is base table row ID (zero + * indirection) For BOTH mode: row_id may be IR index (requires deferred table + * lookup) + * + * 8-byte aligned for efficient memory access and potential SIMD operations. + */ +struct alignas(8) KeyRowPair { + int32_t key; ///< Join key value + uint32_t row_id; ///< Row ID (base table or IR index depending on mode) +}; + +/** + * + * @brief Column of (key, row_id) tuples for join key storage. + * + * Enables accelerated hashtable build (tuples match internal format) and + * zero-indirection row ID propagation through join chains. Used instead of + * separate column_t for join key columns. + * + * Memory layout: 16KB pages containing 2048 KeyRowPair entries each. 
+ * + **/ +struct key_row_column_t { + static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB + static constexpr size_t PAIRS_PER_PAGE = + PAGE_SIZE / sizeof(KeyRowPair); // 2048 + static constexpr size_t ENTRY_SHIFT = 11; // log2(2048) + static constexpr size_t ENTRY_MASK = PAIRS_PER_PAGE - 1; + + struct alignas(PAGE_SIZE) Page { + KeyRowPair data[PAIRS_PER_PAGE]; + }; + + std::vector pages; + size_t num_values = 0; + + /// Base table ID for row_id component (valid when stores_base_row_ids=true) + uint8_t base_table_id = 0; + + /// Source column in base table (for VARCHAR provenance) + uint8_t source_column = 0; + + /// True if row_id contains base table row IDs, false if IR indices + bool stores_base_row_ids = false; + + key_row_column_t() = default; + + key_row_column_t(key_row_column_t &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values), + base_table_id(other.base_table_id), + source_column(other.source_column), + stores_base_row_ids(other.stores_base_row_ids) { + other.pages.clear(); + other.num_values = 0; + } + + key_row_column_t &operator=(key_row_column_t &&other) noexcept { + if (this != &other) { + pages = std::move(other.pages); + num_values = other.num_values; + base_table_id = other.base_table_id; + source_column = other.source_column; + stores_base_row_ids = other.stores_base_row_ids; + other.pages.clear(); + other.num_values = 0; + } + return *this; + } + + key_row_column_t(const key_row_column_t &) = delete; + key_row_column_t &operator=(const key_row_column_t &) = delete; + + ~key_row_column_t() = default; + + /// O(1) read: idx >> 11 for page, idx & 0x7FF for offset + inline KeyRowPair operator[](size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK]; + } + + /// Thread-safe write at idx (requires pages set up first) + inline void write_at(size_t idx, KeyRowPair pair) { + pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = pair; + } + + /// Read only the key at index + inline int32_t 
key_at(size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK].key; + } + + /// Read only the row_id at index + inline uint32_t row_id_at(size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK].row_id; + } + + size_t row_count() const { return num_values; } + void set_row_count(size_t count) { num_values = count; } + + /// Release ownership of pages for zero-copy transfer to hashtable. + /// After this call, the column is empty (pages cleared, num_values = 0). + /// @return Vector of page pointers (caller takes ownership). + std::vector release_pages() && { + std::vector released = std::move(pages); + pages.clear(); + num_values = 0; + return released; + } + + /// Pre-allocate pages from arena + inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, + size_t count) { + size_t pages_needed = (count + PAIRS_PER_PAGE - 1) / PAIRS_PER_PAGE; + pages.reserve(pages_needed); + for (size_t i = 0; i < pages_needed; ++i) { + void *ptr = + arena.alloc_chunk(); + pages.push_back(reinterpret_cast(ptr)); + } + num_values = count; + } +}; + /** * @brief 4-byte value: INT32 direct, VARCHAR packed (19-bit page + 13-bit - * offset), NULL = INT32_MIN, long string offset = 0x1FFF. Refs valid only while + * offset). + * + * NULL = INT32_MIN, long string offset = 0x1FFF. Refs valid only while * source exists. */ struct alignas(4) value_t { @@ -45,16 +187,18 @@ struct alignas(4) value_t { offset_idx = (static_cast(encoded) >> 19) & 0x1FFF; } - static constexpr int32_t LONG_STRING_OFFSET = - 0x1FFF; /**< Sentinel for long strings. */ - static constexpr int32_t NULL_VALUE = - INT32_MIN; /**< NULL sentinel for both types. */ + /** @brief Sentinel for long strings. */ + static constexpr int32_t LONG_STRING_OFFSET = 0x1FFF; + + /** @brief NULL sentinel for both types. */ + static constexpr int32_t NULL_VALUE = INT32_MIN; /** @brief Check if this value represents NULL. 
*/ inline bool is_null() const { return value == NULL_VALUE; } }; -/** @brief Page size for intermediate results (16KB, larger than ColumnarTable). +/** + * @brief Page size for intermediate results (16KB, larger than ColumnarTable). */ constexpr size_t IR_PAGE_SIZE = 1 << 14; @@ -82,9 +226,12 @@ struct column_t { public: std::vector pages; /**< Pointers to arena-allocated pages. */ - uint8_t source_table = - 0; /**< Base table index for VARCHAR dereferencing. */ - uint8_t source_column = 0; /**< Column index within source table. */ + + /** @brief Base table index for VARCHAR dereferencing. */ + uint8_t source_table = 0; + + /** @brief Column index within source table. */ + uint8_t source_column = 0; public: column_t() = default; @@ -114,8 +261,10 @@ struct column_t { ~column_t() = default; - /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. No bounds - * check. */ + /** + * @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. + * @note No bounds check. + */ inline const value_t &operator[](size_t idx) const { return pages[idx >> 12]->data[idx & 0xFFF]; } @@ -153,14 +302,314 @@ struct column_t { using Columnar = std::vector; /** - * @brief Convert column_t vector to ColumnarTable. Dereferences VARCHAR refs. - * @see materialize.h + * @brief Per-base-table deferred row ID storage with multi-column tracking. + * + * Stores 32-bit row IDs for a single base table. All columns from this + * base table share the same row ID lookup, reducing memory from 8 bytes + * per column to 4 bytes per table. + * + * Uses 16KB pages (reuses IR_PAGE arena chunk) with 4096 uint32_t entries. 
*/ -ColumnarTable to_columnar(const Columnar &table, const Plan &plan); -} /* namespace mema */ +struct DeferredTable { + static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB + static constexpr size_t ENTRIES_PER_PAGE = + PAGE_SIZE / sizeof(uint32_t); // 4096 + static constexpr size_t ENTRY_SHIFT = 12; // log2(4096) + static constexpr size_t ENTRY_MASK = ENTRIES_PER_PAGE - 1; + + struct alignas(PAGE_SIZE) Page { + uint32_t data[ENTRIES_PER_PAGE]; + }; + + std::vector pages; + size_t num_values = 0; + + /// Base table ID this deferred table references + uint8_t base_table_id = 0; + + /// True if this deferred table comes from build side (vs probe) + bool from_build = false; + + /// Column indices from this base table that need deferred resolution + std::vector column_indices; + + DeferredTable() = default; + + DeferredTable(DeferredTable &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values), + base_table_id(other.base_table_id), from_build(other.from_build), + column_indices(std::move(other.column_indices)) { + other.pages.clear(); + other.num_values = 0; + } + + DeferredTable &operator=(DeferredTable &&other) noexcept { + if (this != &other) { + pages = std::move(other.pages); + num_values = other.num_values; + base_table_id = other.base_table_id; + from_build = other.from_build; + column_indices = std::move(other.column_indices); + other.pages.clear(); + other.num_values = 0; + } + return *this; + } + + DeferredTable(const DeferredTable &) = delete; + DeferredTable &operator=(const DeferredTable &) = delete; + + ~DeferredTable() = default; + + /// O(1) read: idx >> 12 for page, idx & 0xFFF for offset + inline uint32_t operator[](size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK]; + } + + /// Thread-safe write at idx (requires pages set up first) + inline void write_at(size_t idx, uint32_t row_id) { + pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = row_id; + } + + size_t row_count() const { return 
num_values; } + void set_row_count(size_t count) { num_values = count; } + + /// Check if this table tracks a specific base column + bool has_column(uint8_t col_idx) const { + for (uint8_t c : column_indices) { + if (c == col_idx) + return true; + } + return false; + } +}; + +} // namespace mema -/** @namespace Contest @brief Contest API. @see Plan, execute.cpp */ namespace Contest { -/** @brief Result type for non-root joins (intermediate format). */ -using ExecuteResult = std::vector; -} /* namespace Contest */ + +/** + * @brief Reference from a column to its deferred table. + */ +struct DeferredColumnRef { + uint8_t table_idx; ///< Index into IntermediateResult::deferred_tables + uint8_t base_col; ///< Base column index in Plan::inputs[base_table_id] +}; + +/** + * @brief Lightweight intermediate result with selective materialization. + * + * Stores join key as (value, row_id) tuples for accelerated hashtable build + * and zero-indirection row ID propagation. Other columns use per-table 32-bit + * row ID storage for deferred resolution. + * + * For LEFT_ONLY/RIGHT_ONLY modes: join_key_tuples stores base table row IDs + * For BOTH mode: join_key_tuples may store IR indices + DeferredTable for other + * side + * + * @see AnalyzedColumnInfo for materialization decisions. + * @see key_row_column_t for tuple storage. + * @see DeferredTable for 32-bit row ID storage. + */ +struct IntermediateResult { + /// Join key stored as (value, row_id) tuples for accelerated propagation. + /// Replaces materialized column for join key when present. + std::optional join_key_tuples; + + /// Index of join key column in output (nullopt if root or no tuples). + std::optional join_key_idx; + + /// Other materialized columns (non-join-key columns marked MATERIALIZE). + std::vector materialized; + + /// Map: original column index -> index in materialized (nullopt if + /// deferred or is join key). + std::vector> materialized_map; + + /// Per-base-table deferred row ID storage. 
One DeferredTable per unique + /// (from_build, base_table_id) pair. All columns from same base table share + /// the same row ID lookup. Used for BOTH mode's non-tracked side. + std::vector deferred_tables; + + /// Map: original column index -> DeferredColumnRef (nullopt if + /// materialized). The ref contains table_idx (into deferred_tables) and + /// base_col for resolution. + std::vector> deferred_map; + + /// Reference to node info for column provenance resolution. + const AnalyzedJoinNode *node_info = nullptr; + + /// Total row count. + size_t num_rows = 0; + + IntermediateResult() = default; + IntermediateResult(IntermediateResult &&) = default; + IntermediateResult &operator=(IntermediateResult &&) = default; + IntermediateResult(const IntermediateResult &) = delete; + IntermediateResult &operator=(const IntermediateResult &) = delete; + + /** @brief Total row count. */ + size_t row_count() const { return num_rows; } + + /** @brief Check if join key is stored as tuples. */ + bool has_join_key_tuples() const { return join_key_tuples.has_value(); } + + /** @brief Check if join key tuples contain base row IDs (vs IR indices). */ + bool join_key_has_base_rows() const { + return join_key_tuples && join_key_tuples->stores_base_row_ids; + } + + /** @brief Get join key tuple at index. */ + mema::KeyRowPair get_join_key_tuple(size_t idx) const { + return join_key_tuples ? (*join_key_tuples)[idx] + : mema::KeyRowPair{0, 0}; + } + + /** @brief Check if column was materialized (not deferred). */ + bool is_materialized(size_t orig_idx) const { + return orig_idx < materialized_map.size() && + materialized_map[orig_idx].has_value(); + } + + /** @brief Check if column is the join key (stored as tuples). */ + bool is_join_key(size_t orig_idx) const { + return join_key_idx.has_value() && *join_key_idx == orig_idx; + } + + /** @brief Check if column is deferred. 
*/ + bool is_deferred(size_t orig_idx) const { + return orig_idx < deferred_map.size() && + deferred_map[orig_idx].has_value(); + } + + /** @brief Get materialized column, or nullptr if deferred/join key. */ + const mema::column_t *get_materialized(size_t orig_idx) const { + if (!is_materialized(orig_idx)) + return nullptr; + return &materialized[*materialized_map[orig_idx]]; + } + + /** @brief Get deferred table for a column, or nullptr if materialized. */ + const mema::DeferredTable *get_deferred_table(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_tables[deferred_map[orig_idx]->table_idx]; + } + + /** @brief Get mutable deferred table for a column, or nullptr. */ + mema::DeferredTable *get_deferred_table_mut(size_t orig_idx) { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_tables[deferred_map[orig_idx]->table_idx]; + } + + /** @brief Get base column index for deferred column. */ + uint8_t get_deferred_base_col(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return 0; + return deferred_map[orig_idx]->base_col; + } + + /** @brief Get full DeferredColumnRef for a column, or nullptr. */ + const DeferredColumnRef *get_deferred_ref(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return nullptr; + return &(*deferred_map[orig_idx]); + } + + /** @brief Number of deferred tables (unique base tables). */ + size_t num_deferred_tables() const { return deferred_tables.size(); } +}; + +/** + * @brief Unified abstraction over columnar tables and intermediate results. + * + * Stores ColumnarTable* (base scans) or IntermediateResult (child joins). + * Provides uniform interface for columnar (base table) and intermediate + * data sources. + * + * @see IntermediateResult for intermediate join results. + * @see ColumnarTable for base table storage. + */ +struct JoinInput { + /// Either base table pointer or owned IntermediateResult. 
+ std::variant data; + + /// Original plan node for output_attrs mapping. + const PlanNode *node = nullptr; + + /// Analyzed plan node for materialization decisions. + const AnalyzedNode *analyzed_node = nullptr; + + /// Base table ID (for columnar inputs). + uint8_t table_id = 0; + + /** @brief True if data is columnar (base table). */ + bool is_columnar() const { + return std::holds_alternative(data); + } + + /** @brief Row count for join key column. */ + size_t row_count(size_t col_idx) const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; + } + return std::get(data).row_count(); + } + + /** @brief Total row count. */ + size_t row_count() const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; + } + return std::get(data).row_count(); + } + + /** @brief Number of output columns. */ + size_t output_size() const { + if (node) + return node->output_attrs.size(); + return 0; + } + + /** + * @brief Get deferred table for a column index. + * + * For columnar inputs, returns nullptr (caller must encode fresh). + * For IntermediateResult inputs, returns existing deferred table. + */ + const mema::DeferredTable *get_deferred_table(size_t col_idx) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_deferred_table(col_idx); + } + + /** + * @brief Get base column index for a deferred column. + * + * For columnar inputs, returns 0 (caller must use column metadata). + * For IntermediateResult inputs, returns stored base column index. + */ + uint8_t get_deferred_base_col(size_t col_idx) const { + if (is_columnar()) + return 0; + return std::get(data).get_deferred_base_col( + col_idx); + } + + /** + * @brief Get full DeferredColumnRef for a column index. + * + * For columnar inputs, returns nullptr (caller must encode fresh). + * For IntermediateResult inputs, returns pointer to DeferredColumnRef. 
+ */ + const DeferredColumnRef *get_deferred_ref(size_t col_idx) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_deferred_ref(col_idx); + } +}; + +} // namespace Contest diff --git a/include/data_model/plan.h b/include/data_model/plan.h index 99c623e..897a8e2 100644 --- a/include/data_model/plan.h +++ b/include/data_model/plan.h @@ -33,7 +33,8 @@ #endif /** - * @brief RAII mmap wrapper with refcount. munmap on last ref release. Move-only. + * @brief RAII mmap wrapper with refcount. munmap on last ref release. + * Move-only. */ class MappedMemory { public: @@ -127,8 +128,8 @@ constexpr size_t PAGE_SIZE = 8192; * @brief 8-byte aligned page (8KB) for columnar data. * * INT32: [num_rows:u16][num_values:u16][values...][bitmap at end] - * VARCHAR: [num_rows:u16][num_offsets:u16][offsets:u16...][string bytes][bitmap] - * Long string markers: 0xFFFF (first), 0xFFFE (continuation). + * VARCHAR: [num_rows:u16][num_offsets:u16][offsets:u16...][string + * bytes][bitmap] Long string markers: 0xFFFF (first), 0xFFFE (continuation). * Dense page (no NULLs): num_rows == num_values → fast path. */ struct alignas(8) Page { @@ -219,7 +220,8 @@ struct Plan { size_t root; /**< Index of root node in nodes. */ /** - * @brief Create JoinNode. @return node index. Execution may override build_left. + * @brief Create JoinNode. @return node index. Execution may override + * build_left. */ size_t new_join_node(bool build_left, size_t left, size_t right, size_t left_attr, @@ -282,7 +284,8 @@ template struct ColumnInserter { bitmap.resize(PAGE_SIZE); } - /** @brief Get current page, allocating if needed. Does not advance index. */ + /** @brief Get current page, allocating if needed. Does not advance index. + */ std::byte *get_page() { if (last_page_idx == column.pages.size()) [[unlikely]] { column.new_page(); @@ -369,7 +372,8 @@ template <> struct ColumnInserter { bitmap.resize(PAGE_SIZE); } - /** @brief Get current page, allocating if needed. Does not advance index. 
*/ + /** @brief Get current page, allocating if needed. Does not advance index. + */ std::byte *get_page() { if (last_page_idx == column.pages.size()) [[unlikely]] { column.new_page(); @@ -378,7 +382,8 @@ template <> struct ColumnInserter { return page->data; } - /** @brief Write long string (>PAGE_SIZE-7) across pages. 0xFFFF/0xFFFE markers. */ + /** @brief Write long string (>PAGE_SIZE-7) across pages. 0xFFFF/0xFFFE + * markers. */ void save_long_string(std::string_view value) { size_t offset = 0; auto first_page = true; @@ -484,6 +489,8 @@ struct TimingStats { int64_t setup_ms = 0; /**< JoinSetup + build/probe selection. */ int64_t total_execution_ms = 0; /**< Wall-clock total for execute(). */ int64_t intermediate_ms = 0; /**< construct_intermediate for non-root. */ + int64_t analyze_plan_ms = 0; /**< Deferred: plan analysis time. */ + int64_t deferred_resolve_ms = 0; /**< Deferred: column resolution time. */ }; /** @brief Allocate execution context (worker pool, shared state). */ diff --git a/include/foundation/common.h b/include/foundation/common.h index 16c8aa7..49967cd 100644 --- a/include/foundation/common.h +++ b/include/foundation/common.h @@ -125,7 +125,8 @@ class File { } }; -/** @brief Read entire file into string. @throws std::runtime_error on failure. */ +/** @brief Read entire file into string. @throws std::runtime_error on failure. + */ inline std::string read_file(const std::filesystem::path &path) { File f(path, "rb"); ::fseek(f, 0, SEEK_END); @@ -154,7 +155,8 @@ struct DSU { void unite(size_t x, size_t y) { pa[find(x)] = find(y); } }; -/** @brief Mark unreachable code path for compiler optimization (UB if reached). */ +/** @brief Mark unreachable code path for compiler optimization (UB if reached). + */ [[noreturn]] inline void unreachable() { // Uses compiler specific extensions if possible. 
// Even if no extension is used, undefined behavior is still raised by @@ -164,4 +166,90 @@ struct DSU { #else // GCC, Clang __builtin_unreachable(); #endif -} \ No newline at end of file +} + +namespace Contest { + +/** + * @brief Encoded global row ID: 5-bit table_id + 27-bit row_id. + * + * Supports up to 32 tables and 134M rows per table. + * Used to track original scan table rows through recursive joins. + * + * Encoding: [table_id (5 bits)][row_id (27 bits)] + * - table_id: bits 27-31 + * - row_id: bits 0-26 + */ +struct GlobalRowId { + static constexpr uint32_t TABLE_BITS = 5; + static constexpr uint32_t ROW_BITS = 27; + static constexpr uint32_t TABLE_SHIFT = ROW_BITS; + static constexpr uint32_t ROW_MASK = (1u << ROW_BITS) - 1; + static constexpr uint32_t MAX_TABLES = 1u << TABLE_BITS; // 32 + static constexpr uint32_t MAX_ROWS = 1u << ROW_BITS; // 134,217,728 + + /** @brief Encode table_id and row_id into a single uint32_t. */ + static inline uint32_t encode(uint8_t table_id, uint32_t row_id) { + return (static_cast(table_id) << TABLE_SHIFT) | + (row_id & ROW_MASK); + } + + /** @brief Extract table_id from encoded global row ID. */ + static inline uint8_t table(uint32_t encoded) { + return static_cast(encoded >> TABLE_SHIFT); + } + + /** @brief Extract row_id from encoded global row ID. */ + static inline uint32_t row(uint32_t encoded) { return encoded & ROW_MASK; } +}; + +/** + * @brief 64-bit encoding for deferred column provenance. + * + * Encodes table_id, column_idx, and row_id into a single 64-bit value + * for efficient storage and resolution of deferred columns. + * + * Encoding: [table_id (8 bits)][column_idx (8 bits)][row_id (48 bits)] + * - table_id: bits 56-63 + * - column_idx: bits 48-55 + * - row_id: bits 0-47 + * + * Supports up to 256 tables, 256 columns per table, and 281 trillion rows. 
+ */ +struct DeferredProvenance { + static constexpr uint64_t ROW_BITS = 48; + static constexpr uint64_t COLUMN_BITS = 8; + static constexpr uint64_t TABLE_BITS = 8; + + static constexpr uint64_t ROW_MASK = (1ULL << ROW_BITS) - 1; + static constexpr uint64_t COLUMN_MASK = (1ULL << COLUMN_BITS) - 1; + static constexpr uint64_t COLUMN_SHIFT = ROW_BITS; + static constexpr uint64_t TABLE_SHIFT = ROW_BITS + COLUMN_BITS; + + static constexpr uint64_t MAX_TABLES = 1ULL << TABLE_BITS; // 256 + static constexpr uint64_t MAX_COLUMNS = 1ULL << COLUMN_BITS; // 256 + static constexpr uint64_t MAX_ROWS = 1ULL << ROW_BITS; // 281 trillion + + /** @brief Encode table_id, column_idx, row_id into single uint64_t. */ + static inline uint64_t encode(uint8_t table_id, uint8_t column_idx, + uint64_t row_id) { + return (static_cast(table_id) << TABLE_SHIFT) | + (static_cast(column_idx) << COLUMN_SHIFT) | + (row_id & ROW_MASK); + } + + /** @brief Extract table_id from encoded provenance. */ + static inline uint8_t table(uint64_t encoded) { + return static_cast(encoded >> TABLE_SHIFT); + } + + /** @brief Extract column_idx from encoded provenance. */ + static inline uint8_t column(uint64_t encoded) { + return static_cast((encoded >> COLUMN_SHIFT) & COLUMN_MASK); + } + + /** @brief Extract row_id from encoded provenance. */ + static inline uint64_t row(uint64_t encoded) { return encoded & ROW_MASK; } +}; + +} // namespace Contest \ No newline at end of file diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index 0e2b777..f518df2 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -1,12 +1,3 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - /** * @file hash_join.h * @brief Hash join build and probe operations. 
@@ -19,6 +10,13 @@ * * @see hashtable.h, match_collector.h */ +#pragma once + +#include +#include +#include +#include +#include /** * @namespace Contest::join @@ -27,7 +25,6 @@ */ namespace Contest::join { -using Contest::ExecuteResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -52,16 +49,22 @@ inline UnchainedHashtable build_from_columnar(const JoinInput &input, /** * @brief Build hash table from intermediate results (column_t). * - * Uses join key column from ExecuteResult produced by prior pipeline stages. + * Uses join key column from IntermediateResult produced by prior pipeline + * stages. */ inline UnchainedHashtable build_from_intermediate(const JoinInput &input, size_t attr_idx) { - const auto &result = std::get(input.data); - const auto &column = result[attr_idx]; + const auto &result = std::get(input.data); + // Get the materialized column for the join key + const auto *column = result.get_materialized(attr_idx); + if (!column) { + // This should never happen - join keys must be materialized + std::abort(); + } size_t row_count = input.row_count(attr_idx); UnchainedHashtable hash_table(row_count); - hash_table.build_intermediate(column, 8); + hash_table.build_intermediate(*column, 8); return hash_table; } @@ -79,8 +82,7 @@ template inline std::vector> probe_intermediate(const UnchainedHashtable &hash_table, const mema::column_t &probe_column) { - const auto *keys = hash_table.keys(); - const auto *row_ids = hash_table.row_ids(); + const auto *entries = hash_table.entries(); size_t pool_size = THREAD_COUNT; std::vector> local_buffers(pool_size); @@ -121,8 +123,8 @@ probe_intermediate(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t i = start_idx; i < end_idx; ++i) { - if (keys[i] == key_val) { - local_buf.add_match(row_ids[i], + if (entries[i].key == key_val) { + local_buf.add_match(entries[i].row_id, static_cast(idx)); } } @@ -148,8 +150,7 @@ inline std::vector> 
probe_columnar(const UnchainedHashtable &hash_table, const JoinInput &probe_input, size_t probe_attr) { - const auto *keys = hash_table.keys(); - const auto *row_ids = hash_table.row_ids(); + const auto *entries = hash_table.entries(); auto *table = std::get(probe_input.data); auto [actual_idx_col, _] = probe_input.node->output_attrs[probe_attr]; @@ -197,8 +198,9 @@ probe_columnar(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t j = start_idx; j < end_idx; ++j) { - if (keys[j] == key_val) { - local_buf.add_match(row_ids[j], probe_row_id); + if (entries[j].key == key_val) { + local_buf.add_match(entries[j].row_id, + probe_row_id); } } probe_row_id++; @@ -219,8 +221,9 @@ probe_columnar(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t j = start_idx; j < end_idx; ++j) { - if (keys[j] == key_val) { - local_buf.add_match(row_ids[j], probe_row_id); + if (entries[j].key == key_val) { + local_buf.add_match(entries[j].row_id, + probe_row_id); } } } @@ -233,4 +236,74 @@ probe_columnar(const UnchainedHashtable &hash_table, return local_buffers; } +/** + * @brief Probe hash table with tuple column, returning thread-local buffers. + * + * Uses (key, row_id) tuples from IntermediateResult. The row_id in each + * tuple is propagated to the match buffer, enabling zero-indirection + * resolution when tuples contain base table row IDs. + * + * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY) for compile-time + * specialization of match buffer operations. + * @param hash_table Hash table to probe against. + * @param probe_tuples Tuple column containing (key, row_id) pairs. + * @return Vector of thread-local match buffers. 
+ */ +template +inline std::vector> +probe_tuples(const UnchainedHashtable &hash_table, + const mema::key_row_column_t &probe_tuples) { + + const auto *entries = hash_table.entries(); + const size_t probe_count = probe_tuples.row_count(); + const size_t num_pages = probe_tuples.pages.size(); + + std::vector> local_buffers(THREAD_COUNT); + std::atomic page_counter(0); + + worker_pool().execute([&](size_t thread_id) { + local_buffers[thread_id] = ThreadLocalMatchBuffer( + Contest::platform::get_arena(thread_id)); + auto &local_buf = local_buffers[thread_id]; + + while (true) { + size_t page_idx = page_counter.fetch_add(1); + if (page_idx >= num_pages) + break; + + size_t base = page_idx * mema::key_row_column_t::PAIRS_PER_PAGE; + size_t end = std::min(base + mema::key_row_column_t::PAIRS_PER_PAGE, + probe_count); + + constexpr size_t PREFETCH_DIST = 8; + for (size_t idx = base; idx < end; ++idx) { + // Prefetch future slot + if (idx + PREFETCH_DIST < end) { + hash_table.prefetch_slot( + probe_tuples.key_at(idx + PREFETCH_DIST)); + } + + mema::KeyRowPair pair = probe_tuples[idx]; + + // Skip NULL keys + if (pair.key != mema::value_t::NULL_VALUE) { + auto [start_idx, end_idx] = + hash_table.find_indices(pair.key); + + for (uint64_t i = start_idx; i < end_idx; ++i) { + if (entries[i].key == pair.key) { + // entries[i].row_id = build side's row ID (base or + // IR) pair.row_id = probe side's row ID (base or + // IR) + local_buf.add_match(entries[i].row_id, pair.row_id); + } + } + } + } + } + }); + + return local_buffers; +} + } // namespace Contest::join diff --git a/include/join_execution/hashtable.h b/include/join_execution/hashtable.h index f98ea18..a39c3aa 100644 --- a/include/join_execution/hashtable.h +++ b/include/join_execution/hashtable.h @@ -57,12 +57,18 @@ using Contest::join::BLOOM_TAGS; */ class UnchainedHashtable { public: - /** @brief Key-rowid pair for hash table entries. */ + /** @brief Key-rowid pair for hash table entries (build phase). 
*/ struct alignas(4) Tuple { int32_t key; /**< Join key value. */ uint32_t row_id; /**< Row index in source table. */ }; + /** @brief Fused key-rowid for cache-friendly probe (8-byte aligned). */ + struct alignas(8) Entry { + int32_t key; /**< Join key value. */ + uint32_t row_id; /**< Row index in source table. */ + }; + /** @brief L2-sized chunk for partition buffers. */ static constexpr size_t CHUNK_SIZE = 4096; static constexpr size_t CHUNK_HEADER = 16; @@ -126,28 +132,11 @@ class UnchainedHashtable { nullptr; /**< Arena for hash table allocations. */ Contest::platform::ArenaVector directory; /**< Slot entries: (end_offset << 16) | bloom_tag. */ - Contest::platform::ArenaVector - keys_; /**< Contiguous key storage, indexed by directory. */ - Contest::platform::ArenaVector - row_ids_; /**< Parallel row_id storage, same indexing. */ + Contest::platform::ArenaVector + entries_; /**< Fused key+row_id storage, indexed by directory. */ int shift = 0; /**< Bit shift for slot calculation: slot = hash >> (64-shift). */ - /** - * @brief CRC32-based hash with multiplicative mixing. - * @param key INT32 join key. - * @return 64-bit hash (upper bits index directory slot). - */ - static uint64_t hash_key(int32_t key) noexcept { - constexpr uint64_t k = 0x8648DBDB; -#if defined(__aarch64__) - uint32_t crc = __crc32w(0, static_cast(key)); -#else - uint32_t crc = _mm_crc32_u32(0, static_cast(key)); -#endif - return crc * ((k << 32) + 1); - } - /** * @brief Returns bloom tag from hash. Uses bits 32-42 to index BLOOM_TAGS. 
* @see bloom_tags.h @@ -231,8 +220,7 @@ class UnchainedHashtable { uint64_t h = hash_key(tup.key); size_t local_slot = slot_for(h) - slot_start; uint32_t idx = offsets[local_slot] + counts[local_slot]++; - keys_[idx] = tup.key; - row_ids_[idx] = tup.row_id; + entries_[idx] = {tup.key, tup.row_id}; directory[slot_start + local_slot] |= bloom_tag(h); } } @@ -253,7 +241,7 @@ class UnchainedHashtable { */ explicit UnchainedHashtable(size_t build_size) : arena_(&Contest::platform::get_arena(0)), directory(*arena_), - keys_(*arena_), row_ids_(*arena_) { + entries_(*arena_) { size_t pow2 = 2048; while (pow2 < build_size) pow2 <<= 1; @@ -262,17 +250,29 @@ class UnchainedHashtable { shift = __builtin_ctzll(pow2); } - /** @brief Number of keys in the hash table. */ - size_t size() const noexcept { return keys_.size(); } + /** @brief Number of entries in the hash table. */ + size_t size() const noexcept { return entries_.size(); } /** @brief True if hash table is empty. */ - bool empty() const noexcept { return keys_.empty(); } + bool empty() const noexcept { return entries_.empty(); } - /** @brief Direct access to key array for probe. */ - const int32_t *keys() const noexcept { return keys_.data(); } + /** @brief Direct access to fused entry array for probe. */ + const Entry *entries() const noexcept { return entries_.data(); } - /** @brief Direct access to row_id array for probe. */ - const uint32_t *row_ids() const noexcept { return row_ids_.data(); } + /** + * @brief CRC32-based hash with multiplicative mixing. Public for pre-hash. + * @param key INT32 join key. + * @return 64-bit hash (upper bits index directory slot). + */ + static uint64_t hash_key(int32_t key) noexcept { + constexpr uint64_t k = 0x8648DBDB; +#if defined(__aarch64__) + uint32_t crc = __crc32w(0, static_cast(key)); +#else + uint32_t crc = _mm_crc32_u32(0, static_cast(key)); +#endif + return crc * ((k << 32) + 1); + } /** * @brief Prefetch directory slot for a key to hide memory latency. 
@@ -286,13 +286,24 @@ class UnchainedHashtable { __builtin_prefetch(&directory[slot], 0, 2); } + /** + * @brief Prefetch directory slot using pre-computed hash. + * + * Avoids recomputing hash when already computed for another purpose. + * @param h Pre-computed hash from hash_key(). + */ + void prefetch_slot_prehashed(uint64_t h) const noexcept { + size_t slot = slot_for(h); + __builtin_prefetch(&directory[slot], 0, 2); + } + /** * @brief Find index range for keys matching probe key. * - * @return [start, end) into keys_/row_ids_; (0,0) if bloom rejects. + * @return [start, end) into entries_; (0,0) if bloom rejects. */ std::pair find_indices(int32_t key) const noexcept { - if (keys_.empty()) + if (entries_.empty()) return {0, 0}; uint64_t h = hash_key(key); @@ -308,6 +319,32 @@ class UnchainedHashtable { return {start, end}; } + /** + * @brief Find index range using pre-computed hash (avoids rehashing). + * + * Use when hash was already computed for prefetch or bloom filter check. + * @param key Original key (for comparison in caller). + * @param h Pre-computed hash from hash_key(key). + * @return [start, end) into entries_; (0,0) if bloom rejects. + */ + std::pair + find_indices_prehashed(int32_t key, uint64_t h) const noexcept { + (void)key; // Key used by caller for comparison, not needed here + if (entries_.empty()) + return {0, 0}; + + size_t slot = slot_for(h); + uint64_t entry = directory[slot]; + uint16_t tag = bloom_tag(h); + + if ((entry & tag) != tag) + return {0, 0}; + + uint64_t end = entry >> 16; + uint64_t start = (slot == 0) ? 0 : (directory[slot - 1] >> 16); + return {start, end}; + } + /** * @brief Build hash table from intermediate column_t. 
* @@ -376,8 +413,7 @@ class UnchainedHashtable { size_t total = global_offsets[num_partitions]; if (total == 0) return; - keys_.resize(total); - row_ids_.resize(total); + entries_.resize(total); // Build partitions in parallel const int nt = num_threads; @@ -390,6 +426,92 @@ class UnchainedHashtable { }); } + /** + * @brief Build hash table from (key, row_id) tuple column. + * + * Radix-partitioned parallel build from key_row_column_t. + * Uses page-based work distribution for better cache locality. + * Each thread processes whole pages to avoid cross-page access. + * + * @param tuples Key-row tuple column from IntermediateResult. + * @param num_threads Thread count hint (unused, uses pool size). + */ + void build_from_tuples(const mema::key_row_column_t &tuples, + int /*num_threads*/ = 4) { + const size_t row_count = tuples.row_count(); + if (row_count == 0) + return; + + const int pool_threads = Contest::platform::worker_pool().thread_count(); + const size_t num_slots = directory.size(); + const size_t num_partitions = + compute_num_partitions(row_count, pool_threads); + const int partition_bits = __builtin_ctzll(num_partitions); + const size_t slots_per_partition = num_slots / num_partitions; + + // Thread-local partitions for lock-free parallel partitioning + std::vector allocators(pool_threads); + for (int t = 0; t < pool_threads; ++t) + allocators[t].set_arena(Contest::platform::get_arena(t)); + std::vector> thread_parts(pool_threads); + for (auto &tp : thread_parts) + tp.resize(num_partitions); + + // Page-based partition phase - each thread processes whole pages + constexpr size_t PAIRS_PER_PAGE = mema::key_row_column_t::PAIRS_PER_PAGE; + const size_t num_pages = tuples.pages.size(); + + Contest::platform::worker_pool().execute( + [&, partition_bits, pool_threads](size_t t) { + const int shift = 64 - partition_bits; + const size_t stride = static_cast(pool_threads); + for (size_t pg = t; pg < num_pages; pg += stride) { + // Prefetch next page + if (pg + 
stride < num_pages) { + __builtin_prefetch(tuples.pages[pg + stride]->data, 0, 3); + } + const auto *page_data = tuples.pages[pg]->data; + size_t base = pg * PAIRS_PER_PAGE; + size_t count = std::min(PAIRS_PER_PAGE, row_count - base); + for (size_t i = 0; i < count; ++i) { + const auto &pair = page_data[i]; + uint64_t h = hash_key(pair.key); + size_t p = (partition_bits == 0) ? 0 : (h >> shift); + thread_parts[t][p].append(allocators[t], + {pair.key, pair.row_id}); + } + } + }); + + // Compute global offsets from per-thread counts + Contest::platform::ArenaVector global_offsets(*arena_); + global_offsets.resize(num_partitions + 1); + std::memset(global_offsets.data(), 0, + (num_partitions + 1) * sizeof(size_t)); + for (size_t p = 0; p < num_partitions; ++p) { + for (int t = 0; t < pool_threads; ++t) { + global_offsets[p + 1] += thread_parts[t][p].total_count; + } + global_offsets[p + 1] += global_offsets[p]; + } + + size_t total = global_offsets[num_partitions]; + if (total == 0) + return; + entries_.resize(total); + + // Build partitions in parallel + Contest::platform::worker_pool().execute([&, pool_threads](size_t t) { + for (size_t p = t; p < num_partitions; + p += static_cast(pool_threads)) { + build_partition(thread_parts, p, slots_per_partition, + global_offsets[p], + global_offsets[p + 1] - global_offsets[p], + pool_threads, t); + } + }); + } + /** * @brief Build hash table from ColumnarTable Column. 
* @@ -489,8 +611,7 @@ class UnchainedHashtable { size_t total = global_offsets[num_partitions]; if (total == 0) return; - keys_.resize(total); - row_ids_.resize(total); + entries_.resize(total); const int nt = num_threads; Contest::platform::worker_pool().execute([&, nt](size_t t) { diff --git a/include/join_execution/join_setup.h b/include/join_execution/join_setup.h index 299dd65..f2917f0 100644 --- a/include/join_execution/join_setup.h +++ b/include/join_execution/join_setup.h @@ -1,63 +1,24 @@ /** * @file join_setup.h - * @brief Join configuration and input abstraction. + * @brief Join configuration and build/probe side selection. * - * Provides JoinInput to abstract over columnar and intermediate data sources, - * and utilities for selecting build/probe sides and preparing output columns. + * Provides utilities for selecting build/probe sides and determining + * which row IDs to collect based on output columns. */ #pragma once -#include #include #include #include #include -#include #include /** * @namespace Contest::join - * @brief JoinInput abstraction, build/probe selection, output column setup. + * @brief Build/probe selection and collection mode determination. */ namespace Contest::join { -using Contest::ExecuteResult; -using Contest::io::ColumnarReader; - -/** - * @brief Unified abstraction over columnar tables and intermediate results. - * - * Stores ColumnarTable* (base scans) or ExecuteResult (child joins). Node - * provides output_attrs mapping for column resolution. - */ -struct JoinInput { - std::variant data; - const PlanNode *node; /**< Provides output_attrs for column mapping. */ - uint8_t table_id; /**< Source table ID for provenance tracking. */ - - /** @brief True if data is columnar (base table), false if intermediate. */ - bool is_columnar() const { - return std::holds_alternative(data); - } - - /** - * @brief Row count for a given output column. - * @param col_idx Index into node->output_attrs. 
- */ - size_t row_count(size_t col_idx) const { - if (is_columnar()) { - auto *table = std::get(data); - auto [actual_col_idx, _] = node->output_attrs[col_idx]; - return table->num_rows; - } else { - return std::get(data)[col_idx].row_count(); - } - } - - /** @brief Number of output columns. */ - size_t output_size() const { return node->output_attrs.size(); } -}; - /** * @brief Configuration for build/probe side assignment. * @@ -75,17 +36,6 @@ struct BuildProbeConfig { size_t probe_attr; /**< Join key index in probe's output_attrs. */ }; -/** @brief Resolves global output column index to source input. */ -inline std::tuple -resolve_input_source(size_t global_idx, size_t split_point, - const JoinInput &input_a, const PlanNode &node_a, - const JoinInput &input_b, const PlanNode &node_b) { - if (global_idx < split_point) { - return {input_a, node_a, global_idx}; - } - return {input_b, node_b, global_idx - split_point}; -} - /** * @brief Chooses build/probe sides based on cardinality. * @@ -156,153 +106,63 @@ inline MatchCollectionMode determine_collection_mode( return MatchCollectionMode::BOTH; } -/** - * @brief Creates output columns with provenance metadata from inputs. 
- */ -inline ExecuteResult initialize_output_columns( - const std::vector> &output_attrs, - const PlanNode &left_node, const PlanNode &right_node, - const JoinInput &left_input, const JoinInput &right_input, - size_t estimated_rows) { - ExecuteResult results; - results.reserve(output_attrs.size()); - size_t left_size = left_input.output_size(); - - auto set_column_metadata = [](mema::column_t &col, const JoinInput &input, - const PlanNode &node, size_t col_idx) { - auto [actual_col_idx, _] = node.output_attrs[col_idx]; - if (input.is_columnar()) { - col.source_table = input.table_id; - col.source_column = actual_col_idx; - } else { - const auto &result = std::get(input.data); - col.source_table = result[col_idx].source_table; - col.source_column = result[col_idx].source_column; - } - }; - - for (size_t i = 0; i < output_attrs.size(); ++i) { - auto [col_idx, _] = output_attrs[i]; - auto [input, node, local_idx] = resolve_input_source( - col_idx, left_size, left_input, left_node, right_input, right_node); +} // namespace Contest::join - mema::column_t col; - set_column_metadata(col, input, node, local_idx); - results.push_back(std::move(col)); - } +namespace Contest { - return results; -} +// Forward declare AnalyzedJoinNode +struct AnalyzedJoinNode; /** - * @brief Join output state and columnar reader. + * @brief Tracking info for one side of a join (build or probe). * - * prepared flag implements lazy PageIndex construction. + * Determines whether to embed base table row IDs or IR indices in the + * output tuples for this side. */ -struct JoinSetup { - ExecuteResult results; /**< Output columns being populated. */ - ColumnarReader - columnar_reader; /**< Page cursor caching for columnar access. */ - /** - * True after prepare_output_columns called. 
- */ - bool prepared; - - JoinSetup() : prepared(false) {} +struct SideTrackingInfo { + bool track_base_rows = + false; ///< True to embed base row IDs, false for IR indices + uint8_t base_table_id = 0; ///< Base table to track (if track_base_rows) }; /** - * @brief Initializes JoinSetup with output columns; call before join execution. + * @brief Tracking configuration for intermediate construction. * - * PageIndex construction deferred to prepare_output_columns(). + * Determines what row IDs to embed in join key tuples and whether + * DeferredTables are needed for non-tracked sides. */ -inline JoinSetup -setup_join(const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - const PlanNode &left_node, const PlanNode &right_node, - const JoinInput &left_input, const JoinInput &right_input, - const std::vector> &output_attrs, - size_t estimated_rows) { - JoinSetup setup; - - setup.results = - initialize_output_columns(output_attrs, left_node, right_node, - left_input, right_input, estimated_rows); - - setup.prepared = false; - - return setup; -} +struct TupleTrackingInfo { + SideTrackingInfo build_tracking; ///< Tracking info for build side + SideTrackingInfo probe_tracking; ///< Tracking info for probe side + bool key_from_build = + true; ///< True if parent join key comes from build side +}; /** - * @brief Collects Column pointers for needed output columns from columnar - * input. + * @brief Result of a join execution before intermediate construction. * - * Unused columns get nullptr to skip PageIndex construction. 
- */ -inline platform::ArenaVector -collect_needed_columns(const JoinInput &input, const PlanNode &node, - const platform::ArenaVector &needed, - platform::ThreadArena &arena) { - platform::ArenaVector columns(arena); - columns.resize(node.output_attrs.size()); - std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); - auto *table = std::get(input.data); - - for (size_t i = 0; i < node.output_attrs.size(); ++i) { - auto [actual_col_idx, _] = node.output_attrs[i]; - columns[i] = needed[i] ? &table->columns[actual_col_idx] : nullptr; - } - return columns; -} - -/** - * @brief Prepares ColumnarReader with columns needed for materialization. + * Contains match buffers and metadata needed for deferred IR construction. + * Allows parent join to decide row ID format based on its cardinality + * requirements before constructing the intermediate result. * - * Triggers lazy PageIndex construction only for projected columns. + * @tparam Mode Match collection mode for this join's buffers. 
*/ -inline void prepare_output_columns( - ColumnarReader &reader, const JoinInput &build_input, - const JoinInput &probe_input, const PlanNode &build_node, - const PlanNode &probe_node, - const std::vector> &remapped_attrs, - size_t build_size) { - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - if (!build_is_columnar && !probe_is_columnar) - return; - - auto &arena = Contest::platform::get_arena(0); - - platform::ArenaVector build_needed(arena); - build_needed.resize(build_node.output_attrs.size()); - std::memset(build_needed.data(), 0, build_needed.size()); - - platform::ArenaVector probe_needed(arena); - probe_needed.resize(probe_node.output_attrs.size()); - std::memset(probe_needed.data(), 0, probe_needed.size()); - - for (const auto &[col_idx, dtype] : remapped_attrs) { - if (col_idx < build_size) { - if (build_is_columnar) { - build_needed[col_idx] = 1; - } - } else if (probe_is_columnar) { - probe_needed[col_idx - build_size] = 1; - } - } - - if (build_is_columnar) { - reader.prepare_build(collect_needed_columns(build_input, build_node, - build_needed, arena)); - } - - if (probe_is_columnar) { - reader.prepare_probe(collect_needed_columns(probe_input, probe_node, - probe_needed, arena)); - } -} +template struct MatchResult { + std::vector> buffers; + size_t total_count = 0; + + /// The inputs that were joined (for resolving row IDs during IR + /// construction) + JoinInput build_input; + JoinInput probe_input; + + /// Join configuration + const AnalyzedJoinNode *join_node = nullptr; + join::BuildProbeConfig config; + + /// Convenience accessors + size_t count() const { return total_count; } + bool empty() const { return total_count == 0; } +}; -} // namespace Contest::join +} // namespace Contest diff --git a/include/join_execution/match_collector.h b/include/join_execution/match_collector.h index 78657b7..a4136cb 100644 --- a/include/join_execution/match_collector.h +++ 
b/include/join_execution/match_collector.h @@ -160,6 +160,60 @@ class ThreadLocalMatchBuffer { ChainIterator end() const { return ChainIterator(nullptr, 0); } }; + /** + * @brief Batch reader for efficient SIMD access to chunk chains. + * + * Unlike ChainIterator which reads one element at a time, this reader + * provides direct pointer access to contiguous batches within chunks. + * Essential for SIMD provenance encoding in deferred materialization. + */ + class ChunkBatchReader { + IndexChunk *current_chunk; + uint32_t offset; + size_t remaining; + + public: + ChunkBatchReader(IndexChunk *chunk, size_t count) + : current_chunk(chunk), offset(0), remaining(count) {} + + /** @brief Returns true if more data is available. */ + inline bool has_more() const { return remaining > 0 && current_chunk; } + + /** + * @brief Get pointer to contiguous batch of row IDs. + * + * Returns pointer to up to max_batch contiguous elements within + * current chunk. Actual count may be less if chunk boundary reached. + * + * @param max_batch Maximum elements to return. + * @param actual_count Output: actual number of elements available. + * @return Pointer to contiguous row IDs, or nullptr if exhausted. + */ + inline const uint32_t *get_batch(size_t max_batch, + size_t &actual_count) { + if (!current_chunk || remaining == 0) { + actual_count = 0; + return nullptr; + } + + size_t available = current_chunk->count - offset; + actual_count = std::min({max_batch, remaining, available}); + const uint32_t *ptr = &current_chunk->ids[offset]; + + offset += static_cast(actual_count); + remaining -= actual_count; + + if (offset >= current_chunk->count && current_chunk->next) { + current_chunk = current_chunk->next; + offset = 0; + } + return ptr; + } + + /** @brief Remaining element count. */ + inline size_t count() const { return remaining; } + }; + /** @brief Returns range for iterating left (build) row IDs. 
*/ inline ChainRange left_range() const { return ChainRange(left_head, total_count); @@ -170,6 +224,16 @@ class ThreadLocalMatchBuffer { return ChainRange(right_head, total_count); } + /** @brief Returns batch reader for left (build) row IDs. */ + inline ChunkBatchReader left_batch_reader() const { + return ChunkBatchReader(left_head, total_count); + } + + /** @brief Returns batch reader for right (probe) row IDs. */ + inline ChunkBatchReader right_batch_reader() const { + return ChunkBatchReader(right_head, total_count); + } + /** @brief Returns match count in this buffer. */ size_t count() const { return total_count; } diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index e1086d0..d836409 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,6 @@ */ namespace Contest::join { -using Contest::ExecuteResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -36,6 +34,8 @@ using Contest::platform::worker_pool; * @brief Iterates over non-NULL values in a join input column. * * Abstracts columnar vs intermediate input. Handles NULL bitmaps. + * For IntermediateResult, reads from join_key_tuples if available, + * otherwise from materialized columns (join keys are always available). * * @tparam Func void(uint32_t row_id, int32_t value). 
*/ @@ -69,11 +69,29 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } } else { - const auto &res = std::get(input.data); - const mema::column_t &col = res[attr_idx]; - size_t count = col.row_count(); + const auto &res = std::get(input.data); + + // Check if join key is stored as tuples + if (res.has_join_key_tuples() && res.join_key_idx.has_value() && + *res.join_key_idx == attr_idx) { + const auto &tuples = *res.join_key_tuples; + size_t count = tuples.row_count(); + for (size_t i = 0; i < count; i++) { + mema::KeyRowPair pair = tuples[i]; + if (pair.key != mema::value_t::NULL_VALUE) { + visitor(static_cast(i), pair.key); + } + } + return; + } + + // Fall back to materialized column + const mema::column_t *col = res.get_materialized(attr_idx); + if (!col) + return; // Should not happen - join keys are always available + size_t count = col->row_count(); for (size_t i = 0; i < count; i++) { - const mema::value_t &val = col[i]; + const mema::value_t &val = (*col)[i]; if (!val.is_null()) { visitor(static_cast(i), val.value); } @@ -124,6 +142,7 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, b_vals[i] = INT32_MIN; } + // Setup for columnar probe (page-based parallel processing) const Column *probe_col = nullptr; platform::ArenaVector page_offsets( Contest::platform::get_arena(0)); @@ -140,6 +159,24 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, } page_offsets.push_back(current); } + + // Setup for IntermediateResult probe - check tuples first, then + // materialized + const mema::column_t *probe_mat_col = nullptr; + const mema::key_row_column_t *probe_tuples = nullptr; + if (!probe_input.is_columnar()) { + const auto &res = std::get(probe_input.data); + // Check if join key is stored as tuples + if (res.has_join_key_tuples() && res.join_key_idx.has_value() && + *res.join_key_idx == probe_attr) { + probe_tuples = &(*res.join_key_tuples); + } else { + probe_mat_col = 
res.get_materialized(probe_attr); + if (!probe_mat_col) + return {}; // Join key not available - should not happen + } + } + std::atomic probe_page_counter{0}; worker_pool().execute([&](size_t t_id) { @@ -189,9 +226,22 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, } } } + } else if (probe_tuples) { + // IntermediateResult probe - use tuple column + const mema::key_row_column_t &tuples = *probe_tuples; + size_t count = tuples.row_count(); + size_t start = (t_id * count) / THREAD_COUNT; + size_t end = ((t_id + 1) * count) / THREAD_COUNT; + + for (size_t i = start; i < end; i++) { + mema::KeyRowPair pair = tuples[i]; + if (pair.key != mema::value_t::NULL_VALUE) { + process_value(static_cast(i), pair.key); + } + } } else { - const auto &res = std::get(probe_input.data); - const mema::column_t &col = res[probe_attr]; + // IntermediateResult probe - use materialized column + const mema::column_t &col = *probe_mat_col; size_t count = col.row_count(); size_t start = (t_id * count) / THREAD_COUNT; size_t end = ((t_id + 1) * count) / THREAD_COUNT; diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index 45a4386..ec3db86 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -2,122 +2,502 @@ * @file construct_intermediate.h * @brief Constructs intermediate results for multi-way joins. * - * Allocates and populates ExecuteResult (column_t) from match collectors. - * Templated on MatchCollectionMode for zero-overhead mode selection. + * Allocates and populates IntermediateResult with only MATERIALIZE columns + * (typically just the parent's join key). Deferred columns use per-table + * 32-bit row ID storage for memory efficiency. 
+ * + * Optimized with: + * - Column-major iteration for cache locality + * - Precomputed source metadata to avoid per-row variant access + * - Per-table 32-bit row ID storage (vs per-column 64-bit provenance) + * - Batch access to match collector chunks + * + * @see materialize.h for final resolution of deferred columns. */ #pragma once +#include +#include +#include + #include +#include #include -#include +#include #include #include #include #include -#include -/** - * @namespace Contest::materialize - * @brief Materialization of join results into columnar format. - * - * @see intermediate.h for column_t/value_t format details. - */ -namespace Contest::materialize { -using Contest::ExecuteResult; +namespace Contest { +namespace materialize { + using Contest::io::ColumnarReader; -using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; using Contest::join::ThreadLocalMatchBuffer; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; +// ============================================================================ +// Row ID Batch Operations (for 32-bit per-table deferred) +// ============================================================================ + +namespace row_id_ops { + /** - * @brief Precomputed metadata for resolving an output column's source. + * @brief Write row IDs directly from columnar input. * - * Avoids per-value std::variant accesses and tuple lookups in hot loop. - * 8-byte alignment optimizes struct packing for vector iteration. + * For columnar inputs, we just write the row_id directly (it's already + * the base table row ID). Optimized with memcpy when batch fits in one page. 
+ */ +inline size_t write_row_ids_direct(mema::DeferredTable &dest, size_t start_idx, + const uint32_t *row_ids, size_t count) { + // Constants for DeferredTable layout + constexpr size_t ENTRY_SHIFT = mema::DeferredTable::ENTRY_SHIFT; + constexpr size_t ENTRY_MASK = mema::DeferredTable::ENTRY_MASK; + + size_t page_idx = start_idx >> ENTRY_SHIFT; + size_t offset = start_idx & ENTRY_MASK; + + // Fast path: entire batch fits in current page + if (offset + count <= mema::DeferredTable::ENTRIES_PER_PAGE) { + std::memcpy(&dest.pages[page_idx]->data[offset], row_ids, + count * sizeof(uint32_t)); + return count; + } + + // Slow path: batch spans pages + for (size_t i = 0; i < count; ++i) { + dest.write_at(start_idx + i, row_ids[i]); + } + return count; +} + +/** + * @brief Copy row IDs from child deferred table. + * + * For intermediate inputs, we look up the base table row ID from the + * child's deferred table and copy it to the parent's deferred table. + */ +inline size_t copy_row_ids_from_child(mema::DeferredTable &dest, + size_t start_idx, + const mema::DeferredTable &src, + const uint32_t *row_ids, size_t count) { + for (size_t i = 0; i < count; ++i) { + dest.write_at(start_idx + i, src[row_ids[i]]); + } + return count; +} + +} // namespace row_id_ops + +// ============================================================================ +// Source Precomputation Structures +// ============================================================================ + +/** + * @brief Precomputed metadata for a deferred table source. * - * @see prepare_sources() for precomputation logic. + * Groups columns by (from_build, base_table_id) so we only store 32-bit + * row IDs once per unique base table instead of 64-bit provenance per column. */ -struct alignas(8) SourceInfo { +struct DeferredTableSource { + const mema::DeferredTable *child_table = + nullptr; ///< Source deferred table from child (if any). + uint8_t base_table_id = 0; ///< Base table ID. 
+ uint8_t dest_table_idx = 0; ///< Index in result.deferred_tables[]. + bool from_build = false; ///< True if from build side. + bool needs_direct = false; ///< True if columnar (write row IDs directly). +}; + +/** + * @brief Precomputed metadata for materialized column sources. + * + * Eliminates per-row std::variant access and conditional checks in hot loop. + */ +struct alignas(8) MaterializedColumnSource { const mema::column_t *intermediate_col = - nullptr; /**< Source if intermediate. */ - const Column *columnar_col = nullptr; /**< Source if columnar. */ - size_t remapped_col_idx = 0; /**< Local index within source side. */ - bool is_columnar = false; /**< True if source is columnar table. */ - bool from_build = false; /**< True if from build side, false if probe. */ + nullptr; ///< Source if from IntermediateResult materialized + const Column *columnar_col = nullptr; ///< Source if from ColumnarTable + const mema::DeferredTable *deferred_table = + nullptr; ///< Source deferred table if needs resolution + const mema::key_row_column_t *tuple_col = + nullptr; ///< Source if from child's join_key_tuples + size_t child_output_idx = 0; ///< Index in child's output + size_t mat_col_idx = 0; ///< Index in result.materialized[] + DataType type = DataType::INT32; + uint8_t base_table_id = 0; ///< For VARCHAR source tracking + uint8_t base_column_idx = 0; ///< For VARCHAR source tracking + bool is_columnar = false; ///< True if source is ColumnarTable + bool from_build = false; ///< True if from build side + bool needs_deferred_resolve = false; ///< True if child deferred this column + bool needs_tuple_key_read = false; ///< True if reading key from tuples }; +// ============================================================================ +// Helper Functions +// ============================================================================ + +/** + * @brief Collect columns needed from a JoinInput for page index building. 
+ */ +inline platform::ArenaVector +collect_input_columns(const JoinInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for intermediate construction. + * + * Sets up page indices for columns that need to be read from columnar inputs. + * If parent_key_idx is provided, also prepares the join key column for tuple + * population. + */ +inline void prepare_intermediate_columns( + ColumnarReader &reader, const JoinInput &build_input, + const JoinInput &probe_input, const AnalyzedJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left, + std::optional parent_key_idx = std::nullopt) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // Determine which columns from each side are needed + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark columns needed based on materialization decisions + for (const auto &col : 
join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && + col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + } + } + + // If parent needs a join key via tuples, mark that column as needed too + // This ensures page indices are prepared for efficient tuple population + if (parent_key_idx.has_value()) { + for (const auto &col : join_node.columns) { + if (col.original_idx == *parent_key_idx) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && + col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + break; + } + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_input_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_input_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Prepare page indices for base table columns used in deferred + * resolution. + * + * Called before constructing intermediate results to enable O(log P) page + * lookup instead of O(P) linear scan when resolving deferred columns that need + * to materialize values from base tables. + * + * @param reader ColumnarReader to prepare page indices in. + * @param mat_sources Precomputed materialized column sources. + * @param analyzed_plan Full analyzed plan containing base tables. 
+ */ +inline void prepare_deferred_base_tables( + ColumnarReader &reader, + const std::vector &mat_sources, + const AnalyzedPlan &analyzed_plan) { + if (!analyzed_plan.original_plan) + return; + + // NOTE: We do NOT reset base tables here - they persist across joins + // within the same query since the base tables don't change. + // reset_base_tables() should only be called once per query, externally. + + // Prepare page indices for each base table column that needs deferred + // resolve + for (const auto &src : mat_sources) { + if (src.needs_deferred_resolve) { + uint8_t table_id = src.base_table_id; + uint8_t col_idx = src.base_column_idx; + + if (!reader.is_base_column_prepared(table_id, col_idx)) { + if (table_id < analyzed_plan.original_plan->inputs.size()) { + const auto &base_table = + analyzed_plan.original_plan->inputs[table_id]; + if (col_idx < base_table.columns.size()) { + reader.prepare_base_column(table_id, col_idx, + base_table.columns[col_idx]); + } + } + } + } + } +} + +/** + * @brief Create empty intermediate result with proper schema. + */ +inline IntermediateResult +create_empty_intermediate_result(const AnalyzedJoinNode &node) { + IntermediateResult result; + result.node_info = &node; + result.num_rows = 0; + result.materialized_map.resize(node.columns.size(), std::nullopt); + result.deferred_map.resize(node.columns.size(), std::nullopt); + + size_t mat_count = 0; + for (const auto &col : node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + result.materialized_map[col.original_idx] = mat_count++; + } + // For empty result, we don't need to set up deferred tables + } + result.materialized.resize(mat_count); + + return result; +} + +/** + * @brief Prepare deferred table sources for intermediate construction. + * + * Groups deferred columns by (from_build, base_table_id) to create + * DeferredTable entries. Returns list of sources for populating the tables. 
+ */ +inline std::vector +prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, + const JoinInput &build_input, + const JoinInput &probe_input, bool build_is_left, + IntermediateResult &out_result) { + // Map from (from_build << 8 | base_table_id) -> dest_table_idx + std::unordered_map table_key_to_idx; + std::vector sources; + + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::DEFER) + continue; + + bool from_build = (col.from_left == build_is_left); + uint16_t key = (static_cast(from_build) << 8) | + col.provenance.base_table_id; + + auto it = table_key_to_idx.find(key); + uint8_t dest_idx; + + if (it == table_key_to_idx.end()) { + // New deferred table needed + dest_idx = static_cast(out_result.deferred_tables.size()); + table_key_to_idx[key] = dest_idx; + + mema::DeferredTable dt; + dt.base_table_id = col.provenance.base_table_id; + dt.from_build = from_build; + out_result.deferred_tables.push_back(std::move(dt)); + + // Create source entry + DeferredTableSource src; + src.base_table_id = col.provenance.base_table_id; + src.dest_table_idx = dest_idx; + src.from_build = from_build; + + const auto &src_input = from_build ? build_input : probe_input; + if (src_input.is_columnar()) { + src.needs_direct = true; + src.child_table = nullptr; + } else { + const auto &child_ir = + std::get(src_input.data); + // Find child's deferred table for this base table + const auto *child_ref = + src_input.get_deferred_ref(col.child_output_idx); + if (child_ref) { + src.needs_direct = false; + src.child_table = + src_input.get_deferred_table(col.child_output_idx); + } else if (child_ir.is_join_key(col.child_output_idx)) { + // Child stored this as tuples - the row_id in tuples + // is an IR index, but we need base table row IDs for + // deferred resolution. This shouldn't happen if the + // join key column is properly excluded from DEFER. 
+#ifndef NDEBUG + std::fprintf(stderr, + "[BUG] DEFER column %zu is child's " + "join key - this is unexpected!\n", + col.child_output_idx); +#endif + src.needs_direct = true; + src.child_table = nullptr; + } else { + // Child materialized this, shouldn't happen for DEFER cols + src.needs_direct = true; + src.child_table = nullptr; + } + } + sources.push_back(src); + } else { + dest_idx = it->second; + } + + // Add column to deferred table's column list + out_result.deferred_tables[dest_idx].column_indices.push_back( + col.provenance.base_column_idx); + + // Set up deferred_map entry + DeferredColumnRef ref; + ref.table_idx = dest_idx; + ref.base_col = col.provenance.base_column_idx; + out_result.deferred_map[col.original_idx] = ref; + } + + return sources; +} + /** - * @brief Builds SourceInfo for each output column for fast hot-loop lookup. - * - * @param remapped_attrs Output column specifications (global indexing). - * @param build_input Build side data (ColumnarTable* or ExecuteResult). - * @param probe_input Probe side data (ColumnarTable* or ExecuteResult). - * @param build_node PlanNode for build side (contains output_attrs). - * @param probe_node PlanNode for probe side (contains output_attrs). - * @param build_size Number of columns from build side. - * @return Vector of SourceInfo, one per output column. - * - * @see SourceInfo for field documentation. - * @see construct_intermediate() for consumption in hot loop. + * @brief Precompute materialized column sources for column-major iteration. + * + * For each MATERIALIZE column, determines source type and caches pointers + * to avoid per-row std::variant access in the hot loop. 
*/ -inline std::vector -prepare_sources(const std::vector> &remapped_attrs, - const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - size_t build_size) { - std::vector sources; - sources.reserve(remapped_attrs.size()); - for (const auto &[col_idx, _] : remapped_attrs) { - SourceInfo info; - info.from_build = (col_idx < build_size); - size_t local_idx = info.from_build ? col_idx : col_idx - build_size; - info.remapped_col_idx = local_idx; - const JoinInput &input = info.from_build ? build_input : probe_input; - const PlanNode &node = info.from_build ? build_node : probe_node; - if (input.is_columnar()) { - info.is_columnar = true; - auto *table = std::get(input.data); - auto [actual_idx, _] = node.output_attrs[local_idx]; - info.columnar_col = &table->columns[actual_idx]; +inline std::vector +prepare_materialized_sources(const AnalyzedJoinNode &join_node, + const JoinInput &build_input, + const JoinInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.columns.size()); + + size_t mat_idx = 0; + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + MaterializedColumnSource src; + src.mat_col_idx = mat_idx++; + src.child_output_idx = col.child_output_idx; + src.type = col.type; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? 
build_input : probe_input; + + if (src_input.is_columnar()) { + src.is_columnar = true; + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col.child_output_idx]; + src.columnar_col = &table->columns[actual_idx]; } else { - info.is_columnar = false; - const auto &res = std::get(input.data); - info.intermediate_col = &res[local_idx]; + src.is_columnar = false; + const auto &ir = std::get(src_input.data); + + // Check source type in priority order: + // 1. Tuples (join key stored as key-row pairs) + // 2. Materialized column + // 3. Deferred table + if (ir.is_join_key(col.child_output_idx)) { + // Child stored this column as tuples - read key from there + src.needs_tuple_key_read = true; + src.tuple_col = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col.child_output_idx)) { + src.intermediate_col = + ir.get_materialized(col.child_output_idx); + } else if (ir.is_deferred(col.child_output_idx)) { + src.needs_deferred_resolve = true; + src.deferred_table = + ir.get_deferred_table(col.child_output_idx); + // base_column_idx is already set from col.provenance + } } - sources.push_back(info); + sources.push_back(src); } + return sources; } +// ============================================================================ +// Main Construction Function +// ============================================================================ + /** - * @brief Constructs intermediate results directly from thread-local buffers. + * @brief Constructs intermediate result from thread-local buffers. * - * Each thread iterates its own buffer, avoiding the merge step. Total matches - * computed by summing buffer counts. Each thread writes its contiguous portion - * of output pages. + * Optimized with column-major iteration and per-table 32-bit row ID storage. + * Only materializes columns marked MATERIALIZE in the AnalyzedJoinNode. + * Deferred columns share row ID storage per unique base table. 
* * @tparam Mode Collection mode for compile-time specialization. - * @param buffers Vector of ThreadLocalMatchBuffer from probe. - * @param build_input Build side data (ColumnarTable* or ExecuteResult). - * @param probe_input Probe side data (ColumnarTable* or ExecuteResult). - * @param remapped_attrs Output column specifications (global indexing). - * @param build_node PlanNode for build side output_attrs mapping. - * @param probe_node PlanNode for probe side output_attrs mapping. - * @param build_size Number of output columns from build side. - * @param columnar_reader ColumnarReader with Cursor caching for page access. - * @param results Pre-initialized ExecuteResult, populated in-place. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side data source. + * @param probe_input Probe side data source. + * @param join_node Analyzed join node with materialization decisions. + * @param remapped_attrs Output attributes (after build/probe remapping). + * @param build_output_size Number of columns from build side. + * @param build_is_left True if build side is the original left child. + * @param columnar_reader Reader for columnar data access. + * @param out_result Output IntermediateResult (populated in-place). + * @param analyzed_plan Full analyzed plan for base table access. 
*/ template -inline void construct_intermediate_from_buffers( +void construct_intermediate_from_buffers( std::vector> &buffers, const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const std::vector> &remapped_attrs, - const PlanNode &build_node, const PlanNode &probe_node, size_t build_size, - ColumnarReader &columnar_reader, ExecuteResult &results) { + size_t build_output_size, bool build_is_left, + ColumnarReader &columnar_reader, IntermediateResult &out_result, + const AnalyzedPlan &analyzed_plan) { - // Compute total matches and per-buffer start offsets + // Count total matches and compute buffer start offsets size_t total_matches = 0; std::vector buffer_starts(buffers.size()); for (size_t i = 0; i < buffers.size(); ++i) { @@ -125,40 +505,94 @@ inline void construct_intermediate_from_buffers( total_matches += buffers[i].count(); } - if (total_matches == 0) + if (total_matches == 0) { + out_result = create_empty_intermediate_result(join_node); return; + } - auto sources = prepare_sources(remapped_attrs, build_input, probe_input, - build_node, probe_node, build_size); + // Initialize result metadata + out_result.node_info = &join_node; + out_result.num_rows = total_matches; + out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); + out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); - const size_t num_threads = THREAD_COUNT; - const size_t num_cols = sources.size(); + // Count materialized columns and set up maps + size_t mat_count = 0; + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + out_result.materialized_map[col.original_idx] = mat_count++; + } + } + + // Prepare deferred table sources (this populates deferred_tables and + // deferred_map) + auto deferred_sources = prepare_deferred_table_sources( + join_node, build_input, probe_input, build_is_left, out_result); - // Pre-size page vectors for each column + // Precompute 
materialized sources + auto mat_sources = prepare_materialized_sources(join_node, build_input, + probe_input, build_is_left); + + // Prepare page indices for base tables used in deferred resolution + prepare_deferred_base_tables(columnar_reader, mat_sources, analyzed_plan); + + // Pre-allocate pages using Page = mema::column_t::Page; - size_t total_pages_needed = + using DeferredPage = mema::DeferredTable::Page; + size_t mat_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + size_t def_pages_needed = + (total_matches + mema::DeferredTable::ENTRIES_PER_PAGE - 1) / + mema::DeferredTable::ENTRIES_PER_PAGE; - for (size_t c = 0; c < num_cols; ++c) { - auto &col = results[c]; - col.pages.resize(total_pages_needed); - col.set_row_count(total_matches); + out_result.materialized.resize(mat_count); + for (size_t c = 0; c < mat_count; ++c) { + out_result.materialized[c].pages.resize(mat_pages_needed); + out_result.materialized[c].set_row_count(total_matches); } - // Parallel page allocation - each thread allocates its own pages + for (auto &dt : out_result.deferred_tables) { + dt.pages.resize(def_pages_needed); + dt.set_row_count(total_matches); + } + + // Set source metadata for materialized columns + for (const auto &src : mat_sources) { + out_result.materialized[src.mat_col_idx].source_table = + src.base_table_id; + out_result.materialized[src.mat_col_idx].source_column = + src.base_column_idx; + } + + const size_t num_threads = THREAD_COUNT; + const size_t num_deferred_tables = out_result.deferred_tables.size(); + + // Parallel page allocation worker_pool().execute([&](size_t t) { - for (size_t c = 0; c < num_cols; ++c) { - auto &col = results[c]; - for (size_t p = t; p < total_pages_needed; p += num_threads) { + for (size_t c = 0; c < mat_count; ++c) { + auto &col = out_result.materialized[c]; + for (size_t p = t; p < mat_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) .alloc_chunk(); col.pages[p] = 
reinterpret_cast(ptr); } } + for (size_t d = 0; d < num_deferred_tables; ++d) { + auto &dt = out_result.deferred_tables[d]; + for (size_t p = t; p < def_pages_needed; p += num_threads) { + // Use IR_PAGE (16KB) for DeferredTable pages + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + dt.pages[p] = reinterpret_cast(ptr); + } + } }); - // Parallel: each thread processes its own buffer + // ======================================================================== + // COLUMN-MAJOR PARALLEL POPULATION + // ======================================================================== worker_pool().execute([&](size_t t) { if (t >= buffers.size()) return; @@ -168,50 +602,540 @@ inline void construct_intermediate_from_buffers( return; size_t start = buffer_starts[t]; - Contest::ColumnarReader::Cursor cursor; + ColumnarReader::Cursor cursor; + ColumnarReader::Cursor base_cursor; // For deferred resolution reads - for (size_t c = 0; c < num_cols; ++c) { - const auto &src = sources[c]; - auto &dest_col = results[c]; + // ==================================================================== + // Process MATERIALIZED columns (column-major for cache locality) + // ==================================================================== + for (const auto &src : mat_sources) { + auto &dest_col = out_result.materialized[src.mat_col_idx]; - auto left_range = buf.left_range(); - auto right_range = buf.right_range(); + // Get appropriate range based on which side this column comes from + auto range = src.from_build ? 
buf.left_range() : buf.right_range(); if (src.is_columnar) { + // Columnar source - use ColumnarReader with cursor caching const auto &col = *src.columnar_col; - if (src.from_build) { - size_t k = start; - for (uint32_t rid : left_range) { - dest_col.write_at(k++, - columnar_reader.read_value( - col, src.remapped_col_idx, rid, - col.type, cursor, true)); + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, + columnar_reader.read_value( + col, src.child_output_idx, rid, + src.type, cursor, src.from_build)); + } + } else if (src.needs_tuple_key_read && src.tuple_col) { + // Child stored this column as tuples - read key from there + const auto &tuples = *src.tuple_col; + size_t k = start; + for (uint32_t rid : range) { + int32_t key = tuples.key_at(rid); + dest_col.write_at(k++, mema::value_t{key}); + } + } else if (src.intermediate_col) { + // Intermediate materialized source - direct copy + const auto &vec = *src.intermediate_col; + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, vec[rid]); + } + } else if (src.needs_deferred_resolve && src.deferred_table) { + // Deferred in child - resolve via deferred table + base table + const auto &def_table = *src.deferred_table; + size_t k = start; + for (uint32_t rid : range) { + uint32_t base_row = def_table[rid]; + + if (analyzed_plan.original_plan) [[likely]] { + const auto &base_table = + analyzed_plan.original_plan + ->inputs[src.base_table_id]; + mema::value_t val = + columnar_reader.read_base_table_value( + base_table.columns[src.base_column_idx], + src.base_table_id, src.base_column_idx, + base_row, src.type, base_cursor); + dest_col.write_at(k++, val); + } else { + dest_col.write_at( + k++, mema::value_t{mema::value_t::NULL_VALUE}); } - } else { - size_t k = start; - for (uint32_t rid : right_range) { - dest_col.write_at(k++, - columnar_reader.read_value( - col, src.remapped_col_idx, rid, - col.type, cursor, false)); + } + } + } + + // 
==================================================================== + // Process DEFERRED tables (one pass per unique base table) + // ==================================================================== + for (const auto &def_src : deferred_sources) { + auto &dest_table = + out_result.deferred_tables[def_src.dest_table_idx]; + + auto batch_reader = def_src.from_build ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + if (def_src.needs_direct) { + // Columnar input: write row IDs directly + row_id_ops::write_row_ids_direct(dest_table, k, row_ids, + batch_count); + } else if (def_src.child_table) { + // Intermediate input: copy from child's deferred table + row_id_ops::copy_row_ids_from_child( + dest_table, k, *def_src.child_table, row_ids, + batch_count); } + k += batch_count; + } + } + } + }); +} + +// ============================================================================ +// Tuple-Based Intermediate Construction +// ============================================================================ + +/** + * @brief Resolves a row ID to base table row ID if possible. + * + * For columnar inputs: row ID is already base row ID (direct). + * For IR with tuples storing base rows: lookup via key_row_column_t. + * For IR with tuples storing IR indices: lookup via deferred table. + * For IR without tuples: lookup via deferred table. + * + * @param input The JoinInput to resolve from. + * @param row_id The row ID from match buffer. + * @param key_col_idx The join key column index in input's output. + * @return Resolved base table row ID. 
+ */ +inline uint32_t resolve_to_base_row(const JoinInput &input, uint32_t row_id, + size_t key_col_idx) { + if (input.is_columnar()) { + // Columnar input: row ID is already base table row + return row_id; + } + + const auto &ir = std::get(input.data); + + if (ir.has_join_key_tuples() && ir.join_key_has_base_rows()) { + // IR stores base row IDs in tuples - one lookup + return ir.join_key_tuples->row_id_at(row_id); + } + + // IR stores IR indices - need deferred table lookup + const auto *def_table = ir.get_deferred_table(key_col_idx); + if (def_table) { + return (*def_table)[row_id]; + } + + // Fallback: return as-is (shouldn't happen for correct plans) + return row_id; +} + +/** + * @brief Populates join key tuples column from match buffers. + * + * Extracts join keys and resolves row IDs based on tracking configuration. + * For tracked side with base rows, embeds base table row IDs directly. + * For non-tracked side, embeds IR indices for later DeferredTable lookup. + * + * @tparam Mode Match collection mode. + * @param buffers Thread-local match buffers. + * @param buffer_starts Per-buffer write offsets. + * @param build_input Build side input. + * @param probe_input Probe side input. + * @param key_from_build True if parent's join key comes from build side. + * @param key_child_output_idx Column index in the key input's output. + * @param out_tuples Output tuple column (pre-allocated). + * @param columnar_reader Reader for columnar access. + */ +template +void populate_join_key_tuples( + std::vector> &buffers, + const std::vector &buffer_starts, const JoinInput &build_input, + const JoinInput &probe_input, bool key_from_build, + size_t key_child_output_idx, mema::key_row_column_t &out_tuples, + ColumnarReader &columnar_reader) { + + const JoinInput &key_input = key_from_build ? 
build_input : probe_input; + size_t key_attr = key_child_output_idx; + + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) + return; + + size_t write_pos = buffer_starts[t]; + + // Get the appropriate range based on which side provides the key + auto range = key_from_build ? buf.left_range() : buf.right_range(); + + if (key_input.is_columnar()) { + // Columnar source - read key from base table using prepared page + // index Store OUTPUT IR index (write_pos) so parent can use it to + // index into this IR + auto *table = std::get(key_input.data); + auto [actual_col_idx, _] = key_input.node->output_attrs[key_attr]; + const Column &col = table->columns[actual_col_idx]; + + // Use cursor for efficient sequential/near-sequential access + ColumnarReader::Cursor cursor; + for (uint32_t row_id : range) { + // Use read_value with prepared page index (O(1) amortized) + // instead of read_value_direct_public (O(n) per read) + int32_t key = + columnar_reader + .read_value(col, key_attr, row_id, DataType::INT32, + cursor, key_from_build) + .value; + // Store OUTPUT IR index (write_pos), not base table row_id + // Parent needs IR index to access other columns in this IR + uint32_t output_ir_idx = static_cast(write_pos); + out_tuples.write_at(write_pos++, {key, output_ir_idx}); + } + } else { + // Intermediate source - store OUTPUT IR index + const auto &ir = std::get(key_input.data); + + // Only propagate existing tuples if they contain the column we need + // Otherwise, read from materialized column + if (ir.has_join_key_tuples() && ir.join_key_idx.has_value() && + *ir.join_key_idx == key_attr) { + // IR's tuples contain the column we need - propagate directly + const auto &src_tuples = *ir.join_key_tuples; + + for (uint32_t ir_idx : range) { + mema::KeyRowPair src = src_tuples[ir_idx]; + // Store OUTPUT IR index for parent to index into this IR + uint32_t output_ir_idx = 
static_cast(write_pos); + out_tuples.write_at(write_pos++, {src.key, output_ir_idx}); } } else { + // IR's tuples contain a different column, or no tuples exist + // Read from materialized column instead + const auto *mat_col = ir.get_materialized(key_attr); + if (mat_col) { + for (uint32_t ir_idx : range) { + int32_t key = (*mat_col)[ir_idx].value; + // Store OUTPUT IR index for parent to index into this + // IR + uint32_t output_ir_idx = + static_cast(write_pos); + out_tuples.write_at(write_pos++, {key, output_ir_idx}); + } + } + } + } + }); +} + +/** + * @brief Constructs intermediate result with tuple-based join key storage. + * + * Stores join key as (value, row_id) tuples for accelerated hashtable build + * and zero-indirection row ID propagation. Other columns handled normally + * via deferred tables or materialization. + * + * @tparam Mode Collection mode for compile-time specialization. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side data source. + * @param probe_input Probe side data source. + * @param join_node Analyzed join node with materialization decisions. + * @param config Build/probe configuration. + * @param build_is_left True if build side is the original left child. + * @param parent_key_idx Index of column that will be parent's join key. + * @param columnar_reader Reader for columnar data access. + * @param out_result Output IntermediateResult (populated in-place). + * @param analyzed_plan Full analyzed plan for base table access. 
+ */ +template +void construct_intermediate_with_tuples( + std::vector> &buffers, + const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const join::BuildProbeConfig &config, + bool build_is_left, size_t parent_key_idx, ColumnarReader &columnar_reader, + IntermediateResult &out_result, const AnalyzedPlan &analyzed_plan) { + + // Count total matches and compute buffer start offsets + size_t total_matches = 0; + std::vector buffer_starts(buffers.size()); + for (size_t i = 0; i < buffers.size(); ++i) { + buffer_starts[i] = total_matches; + total_matches += buffers[i].count(); + } + + if (total_matches == 0) { + out_result = create_empty_intermediate_result(join_node); + return; + } + + // Initialize result metadata + out_result.node_info = &join_node; + out_result.num_rows = total_matches; + out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); + out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); + + // Determine if parent's join key comes from build or probe side + // and which base table it traces back to + bool key_from_build = true; + size_t key_child_output_idx = 0; // Column index in child's output + uint8_t key_base_table_id = 0; + uint8_t key_base_column = 0; + + for (const auto &col : join_node.columns) { + if (col.original_idx == parent_key_idx) { + key_from_build = (col.from_left == build_is_left); + key_child_output_idx = col.child_output_idx; + key_base_table_id = col.provenance.base_table_id; + key_base_column = col.provenance.base_column_idx; + break; + } + } + + // Allocate join key tuples column + out_result.join_key_tuples.emplace(); + out_result.join_key_tuples->pre_allocate_from_arena( + Contest::platform::get_arena(0), total_matches); + out_result.join_key_tuples->base_table_id = key_base_table_id; + out_result.join_key_tuples->source_column = key_base_column; + // Always store OUTPUT IR indices (not base row IDs) so parent can + // index into this IR to access 
deferred columns + out_result.join_key_tuples->stores_base_row_ids = false; + out_result.join_key_idx = parent_key_idx; + const JoinInput &key_input = key_from_build ? build_input : probe_input; + (void)key_input; // Used in populate_join_key_tuples + + // Count non-join-key materialized columns and set up maps + size_t mat_count = 0; + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE && + col.original_idx != parent_key_idx) { + out_result.materialized_map[col.original_idx] = mat_count++; + } + } + + // Prepare deferred table sources (unchanged from non-tuple version) + auto deferred_sources = prepare_deferred_table_sources( + join_node, build_input, probe_input, build_is_left, out_result); + + // Precompute materialized sources (excluding join key) + std::vector mat_sources; + mat_sources.reserve(join_node.columns.size()); + size_t mat_idx = 0; + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + if (col.original_idx == parent_key_idx) + continue; // Skip join key - handled via tuples + + MaterializedColumnSource src; + src.mat_col_idx = mat_idx++; + src.child_output_idx = col.child_output_idx; + src.type = col.type; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.is_columnar = true; + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col.child_output_idx]; + src.columnar_col = &table->columns[actual_idx]; + } else { + src.is_columnar = false; + const auto &ir = std::get(src_input.data); + + // Check source type in priority order: + // 1. Tuples (join key stored as key-row pairs) + // 2. Materialized column + // 3. 
Deferred table + if (ir.is_join_key(col.child_output_idx)) { + // Child stored this column as tuples - read key from there + src.needs_tuple_key_read = true; + src.tuple_col = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col.child_output_idx)) { + src.intermediate_col = + ir.get_materialized(col.child_output_idx); + } else if (ir.is_deferred(col.child_output_idx)) { + src.needs_deferred_resolve = true; + src.deferred_table = + ir.get_deferred_table(col.child_output_idx); + } + } + mat_sources.push_back(src); + } + + // Prepare page indices for base tables used in deferred resolution + prepare_deferred_base_tables(columnar_reader, mat_sources, analyzed_plan); + + // Pre-allocate pages + using Page = mema::column_t::Page; + using DeferredPage = mema::DeferredTable::Page; + size_t mat_pages_needed = + (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + size_t def_pages_needed = + (total_matches + mema::DeferredTable::ENTRIES_PER_PAGE - 1) / + mema::DeferredTable::ENTRIES_PER_PAGE; + + out_result.materialized.resize(mat_count); + for (size_t c = 0; c < mat_count; ++c) { + out_result.materialized[c].pages.resize(mat_pages_needed); + out_result.materialized[c].set_row_count(total_matches); + } + + for (auto &dt : out_result.deferred_tables) { + dt.pages.resize(def_pages_needed); + dt.set_row_count(total_matches); + } + + // Set source metadata for materialized columns + for (const auto &src : mat_sources) { + out_result.materialized[src.mat_col_idx].source_table = + src.base_table_id; + out_result.materialized[src.mat_col_idx].source_column = + src.base_column_idx; + } + + const size_t num_threads = THREAD_COUNT; + const size_t num_deferred_tables = out_result.deferred_tables.size(); + + // Parallel page allocation + worker_pool().execute([&](size_t t) { + for (size_t c = 0; c < mat_count; ++c) { + auto &col = out_result.materialized[c]; + for (size_t p = t; p < mat_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) 
+ .alloc_chunk(); + col.pages[p] = reinterpret_cast(ptr); + } + } + for (size_t d = 0; d < num_deferred_tables; ++d) { + auto &dt = out_result.deferred_tables[d]; + for (size_t p = t; p < def_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + dt.pages[p] = reinterpret_cast(ptr); + } + } + }); + + // Populate join key tuples + populate_join_key_tuples( + buffers, buffer_starts, build_input, probe_input, key_from_build, + key_child_output_idx, *out_result.join_key_tuples, columnar_reader); + + // Populate other materialized columns and deferred tables + // (same logic as construct_intermediate_from_buffers) + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) + return; + + size_t start = buffer_starts[t]; + ColumnarReader::Cursor cursor; + ColumnarReader::Cursor base_cursor; // For deferred resolution reads + + // Process MATERIALIZED columns (excluding join key) + for (const auto &src : mat_sources) { + auto &dest_col = out_result.materialized[src.mat_col_idx]; + + auto range = src.from_build ? 
buf.left_range() : buf.right_range(); + + if (src.is_columnar) { + const auto &col = *src.columnar_col; + size_t k = start; + for (uint32_t rid : range) { + mema::value_t val = columnar_reader.read_value( + col, src.child_output_idx, rid, src.type, cursor, + src.from_build); + dest_col.write_at(k++, val); + } + } else if (src.needs_tuple_key_read && src.tuple_col) { + // Child stored this column as tuples - read key from there + const auto &tuples = *src.tuple_col; + size_t k = start; + for (uint32_t rid : range) { + int32_t key = tuples.key_at(rid); + dest_col.write_at(k++, mema::value_t{key}); + } + } else if (src.intermediate_col) { const auto &vec = *src.intermediate_col; - if (src.from_build) { - size_t k = start; - for (uint32_t rid : left_range) { - dest_col.write_at(k++, vec[rid]); + size_t k = start; + for (uint32_t rid : range) { + mema::value_t val = vec[rid]; + dest_col.write_at(k++, val); + } + } else if (src.needs_deferred_resolve && src.deferred_table) { + const auto &def_table = *src.deferred_table; + size_t k = start; + for (uint32_t rid : range) { + uint32_t base_row = def_table[rid]; + + if (analyzed_plan.original_plan) [[likely]] { + const auto &base_table = + analyzed_plan.original_plan + ->inputs[src.base_table_id]; + mema::value_t val = + columnar_reader.read_base_table_value( + base_table.columns[src.base_column_idx], + src.base_table_id, src.base_column_idx, + base_row, src.type, base_cursor); + dest_col.write_at(k++, val); + } else { + dest_col.write_at( + k++, mema::value_t{mema::value_t::NULL_VALUE}); } - } else { - size_t k = start; - for (uint32_t rid : right_range) { - dest_col.write_at(k++, vec[rid]); + } + } + } + + // Process DEFERRED tables + for (const auto &def_src : deferred_sources) { + auto &dest_table = + out_result.deferred_tables[def_src.dest_table_idx]; + + auto batch_reader = def_src.from_build ? 
buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + if (def_src.needs_direct) { + row_id_ops::write_row_ids_direct(dest_table, k, row_ids, + batch_count); + } else if (def_src.child_table) { + row_id_ops::copy_row_ids_from_child( + dest_table, k, *def_src.child_table, row_ids, + batch_count); } + k += batch_count; } } } }); } -} // namespace Contest::materialize +} // namespace materialize +} // namespace Contest diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index e154e93..4cff7ab 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -1,66 +1,149 @@ /** * @file materialize.h - * @brief Materialization of join results into ColumnarTable format. + * @brief Final materialization for execution path. * - * Parallel materialization using per-thread page builders and mmap allocation. - * Templated on MatchCollectionMode for zero-overhead mode selection. + * Materializes all output columns at the root join, resolving deferred + * columns by looking up 32-bit row IDs in DeferredTable back to base tables. + * + * @see construct_intermediate.h for building IntermediateResult intermediates. */ #pragma once -#include #include +#include +#include +#include + #include +#include #include -#include -#include -#include +#include #include -#include #include +#include #include -#include -#include -/** @namespace Contest::materialize @brief Join result materialization. 
*/ -namespace Contest::materialize { +namespace Contest { +namespace materialize { -using Contest::ExecuteResult; using Contest::io::ColumnarReader; -using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; -using Contest::join::resolve_input_source; using Contest::join::ThreadLocalMatchBuffer; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; -/** @brief Creates empty ColumnarTable with correct column types for zero-match - * case. */ +/** + * @brief Collect columns needed from a JoinInput for final materialization. + */ +inline platform::ArenaVector +collect_final_columns(const JoinInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for final materialization at root. + * + * Sets up page indices for ALL output columns (since all need materialization + * at root). 
+ */ +inline void prepare_final_columns( + ColumnarReader &reader, const JoinInput &build_input, + const JoinInput &probe_input, const AnalyzedJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // All output columns needed at root + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark ALL columns needed for final materialization + // from_left refers to original left child + // build_is_left tells us if build side is the left child + for (const auto &col : join_node.columns) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_final_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_final_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty result for zero-match case. 
+ */ inline ColumnarTable create_empty_result( - const std::vector> &remapped_attrs) { + const std::vector> &output_attrs) { ColumnarTable empty_result; empty_result.num_rows = 0; - for (auto [_, data_type] : remapped_attrs) { + for (auto [_, data_type] : output_attrs) { empty_result.columns.emplace_back(data_type); } return empty_result; } /** - * @brief Parallel materialization of a single output column from thread-local - * buffers. + * @brief Materialize a single column from sources. * - * Each thread processes its own buffer directly without merge overhead. + * Handles three cases: + * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index + * 2. MATERIALIZED: Column was materialized in IntermediateResult + * 3. DEFERRED: Resolve via 64-bit provenance to base table * - * @tparam Mode Collection mode for compile-time specialization. - * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. - * @tparam ReaderFunc Callable: (row_id, cursor) -> value_t. + * @tparam Mode Collection mode for compile-time specialization. + * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. + * @tparam ReaderFunc Callable: (row_idx, cursor) -> value_t. * @tparam InitBuilderFunc Callable: (page_allocator) -> BuilderType. - * @param est_bytes_per_row Average bytes per row (4 for INT32, ~35 for - * VARCHAR). */ template -inline void materialize_column_from_buffers( +inline void materialize_column( Column &dest_col, std::vector> &buffers, size_t total_matches, ReaderFunc &&read_value, InitBuilderFunc &&init_builder, bool from_build, size_t est_bytes_per_row) { @@ -162,133 +245,203 @@ inline void materialize_column_from_buffers( } /** - * @brief Materializes a single output column from thread-local buffers. + * @brief Materialize single output column handling deferred resolution. * - * Dispatcher that determines source location (columnar/intermediate, - * build/probe), selects page builder type, and invokes - * materialize_column_from_buffers<>. 
VARCHAR handling requires source Column - * pointer for string dereferencing. + * For deferred columns, resolves via DeferredTable (32-bit row ID) back to + * base table. * * @tparam Mode Collection mode for compile-time specialization. */ template -inline void materialize_single_column_from_buffers( - Column &dest_col, size_t col_idx, size_t build_size, +inline void materialize_single_column( + Column &dest_col, size_t col_idx, size_t build_size, bool build_is_left, std::vector> &buffers, size_t total_matches, const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - ColumnarReader &columnar_reader, const Plan &plan) { - - auto [input, node, local_idx] = resolve_input_source( - col_idx, build_size, build_input, build_node, probe_input, probe_node); - bool from_build = col_idx < build_size; + const AnalyzedJoinNode &join_node, ColumnarReader &columnar_reader, + const AnalyzedPlan &analyzed_plan) { + + // Find column info + const AnalyzedColumnInfo *col_info = nullptr; + for (const auto &col : join_node.columns) { + if (col.original_idx == col_idx) { + col_info = &col; + break; + } + } - const Column *col_source = nullptr; - const mema::column_t *inter_source = nullptr; + if (!col_info) { + // Fallback - shouldn't happen + return; + } - if (input.is_columnar()) { - auto *table = std::get(input.data); - auto [actual_idx, _] = node.output_attrs[local_idx]; - col_source = &table->columns[actual_idx]; + // Determine if this column comes from build or probe side at runtime + bool from_build = (col_info->from_left == build_is_left); + const JoinInput &src_input = from_build ? 
build_input : probe_input; + + // Determine how to read the value + const Column *columnar_source = nullptr; + const mema::column_t *materialized_source = nullptr; + const mema::key_row_column_t *tuple_source = nullptr; + const mema::DeferredTable *deferred_table = nullptr; + uint8_t deferred_base_col = 0; + uint8_t deferred_base_table = 0; + + if (src_input.is_columnar()) { + // Direct columnar read + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col_info->child_output_idx]; + columnar_source = &table->columns[actual_idx]; } else { - const auto &res = std::get(input.data); - inter_source = &res[local_idx]; + const auto &ir = std::get(src_input.data); + // Check if column is stored as join key tuples + if (ir.is_join_key(col_info->child_output_idx)) { + tuple_source = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col_info->child_output_idx)) { + // Read from materialized column + materialized_source = + ir.get_materialized(col_info->child_output_idx); + } else if (ir.is_deferred(col_info->child_output_idx)) { + // Deferred - need to resolve via deferred table + base table + deferred_table = ir.get_deferred_table(col_info->child_output_idx); + deferred_base_col = + ir.get_deferred_base_col(col_info->child_output_idx); + // Get base table ID from the deferred table itself + if (deferred_table) { + deferred_base_table = deferred_table->base_table_id; + } + } } - auto reader = [&](uint32_t rid, ColumnarReader::Cursor &cursor, - DataType type) { - if (col_source) { - return columnar_reader.read_value(*col_source, local_idx, rid, type, - cursor, from_build); + // Create reader lambda + auto reader = [&](uint32_t local_row_id, + ColumnarReader::Cursor &cursor) -> mema::value_t { + mema::value_t result; + if (columnar_source) { + result = columnar_reader.read_value( + *columnar_source, col_info->child_output_idx, local_row_id, + col_info->type, cursor, from_build); + } else if (tuple_source) { + // Read key 
value from tuple column + result = mema::value_t{tuple_source->key_at(local_row_id)}; + } else if (materialized_source) { + result = (*materialized_source)[local_row_id]; + } else if (deferred_table && analyzed_plan.original_plan) { + // Deferred resolution: look up base table row ID from deferred + // table + uint32_t base_row = (*deferred_table)[local_row_id]; + const auto &base_table = + analyzed_plan.original_plan->inputs[deferred_base_table]; + result = columnar_reader.read_value( + base_table.columns[deferred_base_col], deferred_base_col, + base_row, col_info->type, cursor, true); + } else { + result = mema::value_t{mema::value_t::NULL_VALUE}; } - return (*inter_source)[rid]; + return result; }; + // Materialize based on type if (dest_col.type == DataType::INT32) { auto init = [](std::function alloc) { return Int32PageBuilder(std::move(alloc)); }; - materialize_column_from_buffers( + materialize_column( dest_col, buffers, total_matches, [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor, DataType::INT32); + return reader(rid, cursor); }, init, from_build, 4); return; } - const Column *str_src_ptr = col_source; - if (!str_src_ptr && inter_source) { - str_src_ptr = &plan.inputs[inter_source->source_table] - .columns[inter_source->source_column]; + // VARCHAR + const Column *str_src_ptr = columnar_source; + if (!str_src_ptr) { + if (materialized_source) { + str_src_ptr = &analyzed_plan.original_plan + ->inputs[materialized_source->source_table] + .columns[materialized_source->source_column]; + } else if (deferred_table && analyzed_plan.original_plan) { + // For deferred VARCHAR, get source from provenance metadata + str_src_ptr = + &analyzed_plan.original_plan->inputs[deferred_base_table] + .columns[deferred_base_col]; + } + } + + if (!str_src_ptr) { + // Shouldn't happen, but handle gracefully + return; } auto init = [str_src_ptr](std::function alloc) { return VarcharPageBuilder(*str_src_ptr, std::move(alloc)); }; - 
materialize_column_from_buffers( + materialize_column( dest_col, buffers, total_matches, [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor, DataType::VARCHAR); + return reader(rid, cursor); }, init, from_build, 35); } /** - * @brief Materializes all output columns from thread-local buffers into - * ColumnarTable. - * - * Dereferences VARCHAR value_t references into actual string bytes. + * @brief Materialize all output columns from intermediate result. * - * @tparam Mode Collection mode for compile-time specialization. - * @param buffers Thread-local match buffers from probe. - * @param build_input Build side data source. - * @param probe_input Probe side data source. - * @param remapped_attrs Output projection: (col_idx, DataType) pairs. - * @param build_node Metadata for build side output_attrs mapping. - * @param probe_node Metadata for probe side output_attrs mapping. - * @param build_size Number of columns from build side. - * @param columnar_reader PageIndex-accelerated reader for Column page access. - * @param plan Full query plan for VARCHAR dereferencing. - * @return ColumnarTable with self-contained page data. + * For root join. Resolves all deferred columns by looking up 32-bit row IDs + * in DeferredTable back to base tables. * - * @see construct_intermediate.h for creating intermediate ExecuteResult. - * @see page_builders.h for Int32PageBuilder and VarcharPageBuilder. + * @tparam Mode Collection mode for compile-time specialization. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side input. + * @param probe_input Probe side input. + * @param join_node Analyzed join node with column info. + * @param remapped_attrs Output projection after build/probe remapping. + * @param build_size Number of columns from build side. + * @param columnar_reader Reader for columnar data. + * @param analyzed_plan Full analyzed plan for base table access. + * @return ColumnarTable with final output. 
*/ template inline ColumnarTable materialize_from_buffers( std::vector> &buffers, const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const std::vector> &remapped_attrs, - const PlanNode &build_node, const PlanNode &probe_node, size_t build_size, - ColumnarReader &columnar_reader, const Plan &plan) { + size_t build_size, bool build_is_left, ColumnarReader &columnar_reader, + const AnalyzedPlan &analyzed_plan) { - // Compute total_matches + // Compute total matches size_t total_matches = 0; for (const auto &buf : buffers) { total_matches += buf.count(); } - ColumnarTable result; - result.num_rows = total_matches; - if (total_matches == 0) { - for (auto [_, dtype] : remapped_attrs) { - result.columns.emplace_back(dtype); - } - return result; + return create_empty_result(remapped_attrs); } + ColumnarTable result; + result.num_rows = total_matches; + for (size_t out_idx = 0; out_idx < remapped_attrs.size(); ++out_idx) { auto [col_idx, data_type] = remapped_attrs[out_idx]; result.columns.emplace_back(data_type); Column &dest_col = result.columns.back(); - materialize_single_column_from_buffers( - dest_col, col_idx, build_size, buffers, total_matches, build_input, - probe_input, build_node, probe_node, columnar_reader, plan); + + // Pass out_idx (output position) not col_idx (global column index) + // because materialize_single_column searches by original_idx + // which is the output position in join_node.columns + materialize_single_column(dest_col, out_idx, build_size, + build_is_left, buffers, total_matches, + build_input, probe_input, join_node, + columnar_reader, analyzed_plan); } + return result; } -} // namespace Contest::materialize +} // namespace materialize +} // namespace Contest diff --git a/include/platform/arena.h b/include/platform/arena.h index f1aa32e..59d3442 100644 --- a/include/platform/arena.h +++ b/include/platform/arena.h @@ -41,12 +41,13 @@ static constexpr size_t PAGE_2MB = 2 * 1024 * 1024; * @brief 
Chunk type enumeration for arena regions. */ enum class ChunkType : uint8_t { - HASH_CHUNK = 0, ///< 4KB - hash table partition chunks - IR_PAGE = 1, ///< 16KB - intermediate result pages - INDEX_CHUNK = 2, ///< 32KB - match collector index chunks - GENERAL = 3, ///< Variable - misc allocations + HASH_CHUNK = 0, ///< 4KB - hash table partition chunks + IR_PAGE = 1, ///< 16KB - intermediate result pages (32-bit values) + INDEX_CHUNK = 2, ///< 32KB - match collector index chunks + DEFERRED_PAGE = 3, ///< 32KB - deferred provenance pages (64-bit values) + GENERAL = 4, ///< Variable - misc allocations - NUM_TYPES = 4 + NUM_TYPES = 5 }; // ============================================================================ @@ -67,12 +68,15 @@ template <> struct ChunkSize { template <> struct ChunkSize { static constexpr size_t value = 32768; }; +template <> struct ChunkSize { + static constexpr size_t value = 32768; +}; template <> struct ChunkSize { static constexpr size_t value = 0; }; /// Runtime chunk size array indexed by ChunkType. -inline constexpr size_t CHUNK_SIZES[] = {4096, 16384, 32768, 0}; +inline constexpr size_t CHUNK_SIZES[] = {4096, 16384, 32768, 32768, 0}; // ============================================================================ // Page Policies @@ -92,6 +96,7 @@ inline constexpr PagePolicy REGION_PAGE_POLICY[] = { PagePolicy::SMALL_PAGES, // HASH_CHUNK PagePolicy::HUGE_PAGES, // IR_PAGE PagePolicy::HUGE_PAGES, // INDEX_CHUNK + PagePolicy::HUGE_PAGES, // DEFERRED_PAGE PagePolicy::HUGE_PAGES, // GENERAL }; @@ -102,7 +107,7 @@ inline constexpr PagePolicy REGION_PAGE_POLICY[] = { /** * @brief Region size configuration based on available DRAM. * - * Uses 75% of SPC__NUMA_NODE_DRAM_MB, divided equally (25%) among 4 regions. + * Uses 75% of SPC__NUMA_NODE_DRAM_MB, divided equally (20%) among 5 regions. 
*/ struct RegionConfig { size_t total_arena_bytes; @@ -113,8 +118,8 @@ struct RegionConfig { 1024ULL * 1024ULL * 3ULL / 4ULL; } - /// Get total size for a region (25% each). - size_t get(ChunkType /*ct*/) const { return total_arena_bytes / 4; } + /// Get total size for a region (20% each). + size_t get(ChunkType /*ct*/) const { return total_arena_bytes / 5; } /// Get total arena size. size_t total() const { return total_arena_bytes; } @@ -450,7 +455,8 @@ class ArenaManager { // Global Instance and Helper // ============================================================================ -/// Global arena manager instance (inline global, constructed at program startup). +/// Global arena manager instance (inline global, constructed at program +/// startup). inline ArenaManager g_arena_manager{}; /// Get thread arena by thread ID. diff --git a/include/platform/hardware.h b/include/platform/hardware.h index 83ef443..0cbb011 100644 --- a/include/platform/hardware.h +++ b/include/platform/hardware.h @@ -10,8 +10,8 @@ */ #pragma once -#define SPC__CORE_COUNT 8 -#define SPC__THREAD_COUNT 16 +#define SPC__CORE_COUNT 6 +#define SPC__THREAD_COUNT 6 #define SPC__LEVEL1_DCACHE_SIZE 32768 #define SPC__LEVEL2_CACHE_SIZE 1048576 #define SPC__LEVEL3_CACHE_SIZE 33554432 diff --git a/src/analyze_plan.cpp b/src/analyze_plan.cpp new file mode 100644 index 0000000..3c7fc83 --- /dev/null +++ b/src/analyze_plan.cpp @@ -0,0 +1,302 @@ +/** + * @file analyze_plan.cpp + * @brief Analyzes query plan and computes materialization decisions. + * + * Walks the plan tree in post-order to determine which columns should be + * materialized eagerly (join keys needed by parent) vs deferred until final + * output. Traces column provenance back to base tables for deferred resolution. + * + * @see deferred_plan.h for AnalyzedPlan structure. + */ +#include +#include + +#include + +namespace Contest { + +namespace { + +/** + * @brief Parent relationship info for a node. 
+ */ +struct ParentInfo { + size_t parent_idx; ///< Parent node index in Plan::nodes. + bool is_left_child; ///< True if this node is parent's left child. +}; + +/** + * @brief Build map of node_idx → parent info. + * + * Root node will not have an entry in the map. + */ +std::unordered_map build_parent_map(const Plan &plan) { + std::unordered_map parent_map; + + for (size_t i = 0; i < plan.nodes.size(); ++i) { + const auto &node = plan.nodes[i]; + if (const auto *join = std::get_if(&node.data)) { + parent_map[join->left] = {i, true}; + parent_map[join->right] = {i, false}; + } + } + return parent_map; +} + +/** + * @brief Trace column provenance to base table. + * + * Recursively follows column through join nodes until reaching a scan node. + * + * @param plan Original query plan. + * @param node_idx Current node index. + * @param column_idx Column index in node's output_attrs. + * @return ColumnProvenance with base table ID and column index. + */ +ColumnProvenance trace_provenance(const Plan &plan, size_t node_idx, + size_t column_idx) { + const auto &node = plan.nodes[node_idx]; + + if (const auto *scan = std::get_if(&node.data)) { + // Base case: column comes directly from scan + auto [actual_col_idx, _] = node.output_attrs[column_idx]; + return ColumnProvenance{static_cast(scan->base_table_id), + static_cast(actual_col_idx)}; + } + + // Join node: determine which child the column comes from + const auto &join = std::get(node.data); + const auto &left_node = plan.nodes[join.left]; + size_t left_size = left_node.output_attrs.size(); + + auto [col_idx, _] = node.output_attrs[column_idx]; + + if (col_idx < left_size) { + // Column from left child + return trace_provenance(plan, join.left, col_idx); + } else { + // Column from right child + return trace_provenance(plan, join.right, col_idx - left_size); + } +} + +/** + * @brief Find which column index in this node the parent needs as join key. + * + * @param plan Original query plan. 
+ * @param node_idx Current node index. + * @param parent_map Map of node → parent relationship. + * @return Column index parent uses as join key, or nullopt if root. + */ +std::optional +find_parent_join_key(const Plan &plan, size_t node_idx, + const std::unordered_map &parent_map) { + auto it = parent_map.find(node_idx); + if (it == parent_map.end()) { + return std::nullopt; // Root node + } + + const auto &parent_node = plan.nodes[it->second.parent_idx]; + const auto &parent_join = std::get(parent_node.data); + + // Parent's join key for this child + return it->second.is_left_child ? parent_join.left_attr + : parent_join.right_attr; +} + +/** + * @brief Compute base collection mode based on which sides have output columns. + * + * Assumes build=left. If build=right at runtime, caller flips + * LEFT_ONLY/RIGHT_ONLY. + */ +join::MatchCollectionMode +compute_base_collection_mode(const std::vector &columns, + size_t left_output_size) { + bool needs_left = false; + bool needs_right = false; + + for (const auto &col : columns) { + if (col.from_left) { + needs_left = true; + } else { + needs_right = true; + } + if (needs_left && needs_right) { + return join::MatchCollectionMode::BOTH; + } + } + + if (needs_left && !needs_right) + return join::MatchCollectionMode::LEFT_ONLY; + if (needs_right && !needs_left) + return join::MatchCollectionMode::RIGHT_ONLY; + return join::MatchCollectionMode::BOTH; +} + +} // anonymous namespace + +AnalyzedPlan analyze_plan(const Plan &plan) { + AnalyzedPlan analyzed; + analyzed.original_plan = &plan; + analyzed.nodes.resize(plan.nodes.size()); + analyzed.root = plan.root; + + auto parent_map = build_parent_map(plan); + + // Build post-order traversal (children before parents) + std::vector post_order; + post_order.reserve(plan.nodes.size()); + std::vector visited(plan.nodes.size(), false); + + std::function visit = [&](size_t idx) { + if (visited[idx]) + return; + visited[idx] = true; + + const auto &node = plan.nodes[idx]; + if (const 
auto *join = std::get_if(&node.data)) { + visit(join->left); + visit(join->right); + } + post_order.push_back(idx); + }; + visit(plan.root); + + // PASS 1: Build structure and initial materialization decisions + for (size_t node_idx : post_order) { + const auto &node = plan.nodes[node_idx]; + + if (const auto *scan = std::get_if(&node.data)) { + // Scan node: simple wrapper + AnalyzedScanNode ascan; + ascan.node_idx = node_idx; + ascan.base_table_id = scan->base_table_id; + ascan.output_attrs = node.output_attrs; + analyzed.nodes[node_idx] = std::move(ascan); + + } else { + // Join node: compute materialization decisions + const auto &join = std::get(node.data); + AnalyzedJoinNode ajoin; + ajoin.node_idx = node_idx; + ajoin.left_child_idx = join.left; + ajoin.right_child_idx = join.right; + ajoin.left_join_attr = join.left_attr; + ajoin.right_join_attr = join.right_attr; + ajoin.output_attrs = node.output_attrs; + ajoin.is_root = (node_idx == plan.root); + + // Find which column parent needs as join key + ajoin.parent_join_key_idx = + find_parent_join_key(plan, node_idx, parent_map); + + // Get child sizes for determining column source + const auto &left_node = plan.nodes[join.left]; + size_t left_size = left_node.output_attrs.size(); + + // Build column info for each output column + for (size_t i = 0; i < node.output_attrs.size(); ++i) { + auto [col_idx, col_type] = node.output_attrs[i]; + + AnalyzedColumnInfo info; + info.original_idx = i; + info.type = col_type; + + // Determine if column is from left or right child + // col_idx is the combined L+R index: + // - [0, left_size) = position in left child's output + // - [left_size, ...) 
= position in right child's output + + // left_size + if (col_idx < left_size) { + info.from_left = true; + info.child_output_idx = col_idx; + } else { + info.from_left = false; + info.child_output_idx = col_idx - left_size; + } + + // Materialization decision: + // - At root: ALL columns must be materialized (final output) + // - At intermediate: only parent's join key is materialized + if (ajoin.is_root) { + // Root node: materialize everything + info.resolution = ColumnResolution::MATERIALIZE; + } else if (ajoin.parent_join_key_idx.has_value() && + i == *ajoin.parent_join_key_idx) { + info.resolution = ColumnResolution::MATERIALIZE; + } else { + info.resolution = ColumnResolution::DEFER; + } + + // Trace provenance to base table + info.provenance = trace_provenance(plan, node_idx, i); + + ajoin.columns.push_back(std::move(info)); + } + + // Compute collection mode and count deferred columns + ajoin.base_collection_mode = + compute_base_collection_mode(ajoin.columns, left_size); + + // Count deferred columns for pre-allocation + ajoin.num_deferred_columns = 0; + for (const auto &col : ajoin.columns) { + if (col.resolution == ColumnResolution::DEFER) { + ++ajoin.num_deferred_columns; + } + } + + analyzed.nodes[node_idx] = std::move(ajoin); + } + } + + // PASS 2: Propagate materialization requirements to children + // Process in reverse post-order (parents before children) + for (auto it = post_order.rbegin(); it != post_order.rend(); ++it) { + size_t node_idx = *it; + auto *ajoin = std::get_if(&analyzed.nodes[node_idx]); + if (!ajoin) + continue; + + // For each column that must be MATERIALIZE, ensure the child also + // materializes it + for (const auto &col : ajoin->columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + // Find which child this column comes from + size_t child_idx = + col.from_left ? 
ajoin->left_child_idx : ajoin->right_child_idx; + + auto *child_ajoin = + std::get_if(&analyzed.nodes[child_idx]); + if (!child_ajoin) + continue; // Child is a scan - always has data + + // Mark child's column as MATERIALIZE + if (col.child_output_idx < child_ajoin->columns.size()) { + child_ajoin->columns[col.child_output_idx].resolution = + ColumnResolution::MATERIALIZE; + } + } + } + + // PASS 3: Recount num_deferred_columns after propagation + for (size_t node_idx : post_order) { + auto *ajoin = std::get_if(&analyzed.nodes[node_idx]); + if (!ajoin) + continue; + + ajoin->num_deferred_columns = 0; + for (const auto &col : ajoin->columns) { + if (col.resolution == ColumnResolution::DEFER) { + ++ajoin->num_deferred_columns; + } + } + } + + return analyzed; +} + +} // namespace Contest diff --git a/src/execute.cpp b/src/execute.cpp index c5a3eed..ce81a31 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -5,18 +5,25 @@ * Traverses plan tree: resolve inputs -> select build/probe -> algorithm * selection -> match collection -> output construction. * - * Flow: execute() -> execute_impl() recursively -> resolve_join_input() for - * ScanNode (ColumnarTable*) or JoinNode (ExecuteResult). Root produces - * ColumnarTable; non-root produces ExecuteResult. + * Flow: execute() -> execute_impl() recursively -> resolve_input() for + * ScanNode (ColumnarTable*) or JoinNode (IntermediateResult). Root produces + * ColumnarTable; non-root produces IntermediateResult. * - * Lifetimes: base tables live for query duration; ExecuteResult held on stack - * until parent completes; VARCHAR refs valid via base table lifetime. + * Lifetimes: base tables live for query duration; IntermediateResult held on + * stack until parent completes; VARCHAR refs valid via base table lifetime. * * Row order non-deterministic (work-stealing); semantically correct per SQL. 
* * @see plan.h, match_collector.h, materialize.h, construct_intermediate.h */ +#include "data_model/plan.h" +#include #include +#include +#include +#include +#include +#include #if defined(__APPLE__) && defined(__aarch64__) #include #elif defined(SPC__USE_BENCHMARKVM_HARDWARE) @@ -27,6 +34,7 @@ #include #include +#include #include #include #include @@ -44,71 +52,87 @@ namespace Contest { using namespace join; -using materialize::construct_intermediate_from_buffers; -using materialize::create_empty_result; +using materialize::construct_intermediate_with_tuples; +using materialize::create_empty_intermediate_result; using materialize::materialize_from_buffers; /** - * @brief Result variant: ExecuteResult (intermediate, value_t columns) or - * ColumnarTable (final output per contest API). + * @brief Result variant: IntermediateResult (non-root) or ColumnarTable (root). */ -using JoinResult = std::variant; +using JoinResult = std::variant; -/** - * @brief Recursive join execution with timing. - * @param plan Query plan with nodes and base tables. - * @param node_idx Current node index in plan.nodes. - * @param is_root True -> ColumnarTable output; false -> ExecuteResult. - * @param stats Timing accumulator. - * @return JoinResult (intermediate or final). - */ -JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, +// Forward declaration +JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, TimingStats &stats); /** * @brief Resolve plan node to JoinInput. * * ScanNode -> non-owning ColumnarTable*; JoinNode -> recursive execution - * returning owned ExecuteResult. Implements depth-first traversal. - * - * @param plan Query plan. - * @param node_idx Node index to resolve. - * @param stats Timing accumulator. - * @return JoinInput with data variant and metadata. + * returning owned IntermediateResult. 
*/ -JoinInput resolve_join_input(const Plan &plan, size_t node_idx, - TimingStats &stats) { +JoinInput resolve_input(const AnalyzedPlan &plan, size_t node_idx, + TimingStats &stats) { JoinInput input; - const auto &node = plan.nodes[node_idx]; - input.node = &node; + const auto &anode = plan[node_idx]; + const auto &pnode = plan.original_plan->nodes[node_idx]; + input.node = &pnode; + input.analyzed_node = &anode; - if (const auto *scan = std::get_if(&node.data)) { - input.data = &plan.inputs[scan->base_table_id]; + if (const auto *scan = std::get_if(&anode)) { + input.data = &plan.original_plan->inputs[scan->base_table_id]; input.table_id = scan->base_table_id; } else { auto result = execute_impl(plan, node_idx, false, stats); - input.data = std::get(std::move(result)); + input.data = std::get(std::move(result)); input.table_id = 0; } return input; } +/** + * @brief Select build/probe sides for join input. + */ +BuildProbeConfig select_join_build_probe_side( + const JoinNode &join, const JoinInput &left_input, + const JoinInput &right_input, + const std::vector> &output_attrs) { + BuildProbeConfig config; + + size_t left_rows = left_input.row_count(join.left_attr); + size_t right_rows = right_input.row_count(join.right_attr); + config.build_left = left_rows <= right_rows; + + config.build_attr = config.build_left ? join.left_attr : join.right_attr; + config.probe_attr = config.build_left ? join.right_attr : join.left_attr; + + config.remapped_attrs = output_attrs; + size_t left_size = left_input.output_size(); + size_t build_size = + config.build_left ? left_size : right_input.output_size(); + + if (!config.build_left) { + for (auto &[col_idx, dtype] : config.remapped_attrs) { + if (col_idx < left_size) { + col_idx = build_size + col_idx; + } else { + col_idx = col_idx - left_size; + } + } + } + return config; +} + /** * @brief Unified probe + materialize helper templated on collection mode. 
- * - * Executes probe (nested loop or hash join) and materialization/intermediate - * construction in a single function. Template parameter eliminates runtime - * branching in hot loops. - * - * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY). */ template JoinResult execute_join_with_mode( bool use_nested_loop, bool probe_is_columnar, bool is_root, const UnchainedHashtable *hash_table, const JoinInput &build_input, const JoinInput &probe_input, const BuildProbeConfig &config, - const PlanNode &build_node, const PlanNode &probe_node, JoinSetup &setup, - io::ColumnarReader &columnar_reader, const Plan &plan, TimingStats &stats) { + const AnalyzedJoinNode &join_node, io::ColumnarReader &columnar_reader, + const AnalyzedPlan &plan, TimingStats &stats) { std::vector> match_buffers; @@ -128,9 +152,31 @@ JoinResult execute_join_with_mode( config.probe_attr); } else { const auto &probe_result = - std::get(probe_input.data); - match_buffers = probe_intermediate( - *hash_table, probe_result[config.probe_attr]); + std::get(probe_input.data); + + // Use tuple-based probe if available + if (probe_result.has_join_key_tuples() && + probe_result.join_key_idx.has_value() && + *probe_result.join_key_idx == config.probe_attr) { + match_buffers = probe_tuples( + *hash_table, *probe_result.join_key_tuples); + } else { + // Fall back to materialized column probe + const auto *mat_col = + probe_result.get_materialized(config.probe_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: probe join key not materialized! " + "probe_attr=%zu " + "mat_map_size=%zu num_rows=%zu has_tuples=%d\n", + config.probe_attr, probe_result.materialized_map.size(), + probe_result.num_rows, + probe_result.has_join_key_tuples() ? 
1 : 0); + std::abort(); + } + match_buffers = probe_intermediate(*hash_table, *mat_col); + } } auto probe_end = std::chrono::high_resolution_clock::now(); stats.hash_join_probe_ms += @@ -148,16 +194,19 @@ JoinResult execute_join_with_mode( auto mat_start = std::chrono::high_resolution_clock::now(); JoinResult final_result; if (total_matches == 0) { - final_result = create_empty_result(config.remapped_attrs); + final_result = + materialize::create_empty_result(config.remapped_attrs); } else { - prepare_output_columns( - columnar_reader, build_input, probe_input, build_node, - probe_node, config.remapped_attrs, build_input.output_size()); + // Prepare page indices for final materialization + materialize::prepare_final_columns( + columnar_reader, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left); final_result = materialize_from_buffers( - match_buffers, build_input, probe_input, config.remapped_attrs, - build_node, probe_node, build_input.output_size(), - columnar_reader, plan); + match_buffers, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left, columnar_reader, plan); } auto mat_end = std::chrono::high_resolution_clock::now(); stats.materialize_ms += @@ -167,93 +216,110 @@ JoinResult execute_join_with_mode( return final_result; } else { auto inter_start = std::chrono::high_resolution_clock::now(); + IntermediateResult result; if (total_matches > 0) { - prepare_output_columns( - columnar_reader, build_input, probe_input, build_node, - probe_node, config.remapped_attrs, build_input.output_size()); - - construct_intermediate_from_buffers( - match_buffers, build_input, probe_input, config.remapped_attrs, - build_node, probe_node, build_input.output_size(), - columnar_reader, setup.results); + materialize::prepare_intermediate_columns( + columnar_reader, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + 
config.build_left, join_node.parent_join_key_idx); + + construct_intermediate_with_tuples( + match_buffers, build_input, probe_input, join_node, config, + config.build_left, *join_node.parent_join_key_idx, + columnar_reader, result, plan); + } else { + result = create_empty_intermediate_result(join_node); } auto inter_end = std::chrono::high_resolution_clock::now(); stats.intermediate_ms += std::chrono::duration_cast(inter_end - inter_start) .count(); - return std::move(setup.results); + return std::move(result); } } /** - * @brief Core recursive join execution. - * - * Phases: resolve L/R inputs -> select build/probe (smaller=build) -> algorithm - * choice -> build/probe -> output construction. - * - * Algorithm: nested loop if build_rows < HASH_TABLE_THRESHOLD (8); else radix- - * partitioned hash join. - * - * Memory: hash table and MatchCollector local (freed on return); child - * ExecuteResults on stack until materialization; setup.results pre-allocated. + * @brief Recursive join execution. 
*/ -JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, +JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, TimingStats &stats) { - auto &node = plan.nodes[node_idx]; + const auto &anode = plan[node_idx]; - if (!std::holds_alternative(node.data)) { - return ExecuteResult{}; + if (std::holds_alternative(anode)) { + return IntermediateResult{}; } - const auto &join = std::get(node.data); - const auto &output_attrs = node.output_attrs; - const auto &left_node = plan.nodes[join.left]; - const auto &right_node = plan.nodes[join.right]; + const auto &ajoin = std::get(anode); + const auto &original_plan = *plan.original_plan; + const auto &pnode = original_plan.nodes[node_idx]; + const auto &join = std::get(pnode.data); - JoinInput left_input = resolve_join_input(plan, join.left, stats); - JoinInput right_input = resolve_join_input(plan, join.right, stats); + // Resolve inputs + JoinInput left_input = resolve_input(plan, ajoin.left_child_idx, stats); + JoinInput right_input = resolve_input(plan, ajoin.right_child_idx, stats); - /* Build/probe selection: smaller input = build side; remaps output_attrs. - */ + // Build/probe selection auto setup_start = std::chrono::high_resolution_clock::now(); - auto config = - select_build_probe_side(join, left_input, right_input, output_attrs); + auto config = select_join_build_probe_side(join, left_input, right_input, + ajoin.output_attrs); const JoinInput &build_input = config.build_left ? left_input : right_input; const JoinInput &probe_input = config.build_left ? right_input : left_input; - const auto &build_node = config.build_left ? left_node : right_node; - const auto &probe_node = config.build_left ? right_node : left_node; bool build_is_columnar = build_input.is_columnar(); bool probe_is_columnar = probe_input.is_columnar(); - /* Nested loop for <8 rows (L1-resident, no hash overhead, SIMD). 
*/ const size_t HASH_TABLE_THRESHOLD = 8; size_t build_rows = build_input.row_count(config.build_attr); bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD); - /* Pre-allocate ExecuteResult; ColumnarReader PageIndex built lazily. */ - JoinSetup setup = setup_join(build_input, probe_input, build_node, - probe_node, left_node, right_node, left_input, - right_input, output_attrs, build_rows); + io::ColumnarReader columnar_reader; auto setup_end = std::chrono::high_resolution_clock::now(); - auto setup_elapsed = std::chrono::duration_cast( - setup_end - setup_start); - stats.setup_ms += setup_elapsed.count(); - - /* Skip unused-side row IDs if output needs only one side (50% savings). */ - MatchCollectionMode collection_mode = determine_collection_mode( - config.remapped_attrs, config.build_left ? left_input.output_size() - : right_input.output_size()); + stats.setup_ms += std::chrono::duration_cast( + setup_end - setup_start) + .count(); + + // Use pre-computed collection mode from plan analysis. + // base_collection_mode assumes build=left; flip if build=right at runtime. + MatchCollectionMode mode = ajoin.base_collection_mode; + if (!config.build_left) { + if (mode == MatchCollectionMode::LEFT_ONLY) + mode = MatchCollectionMode::RIGHT_ONLY; + else if (mode == MatchCollectionMode::RIGHT_ONLY) + mode = MatchCollectionMode::LEFT_ONLY; + } - /* Build hash table if needed (before mode dispatch). */ + // Build hash table if needed std::optional hash_table; if (!use_nested_loop) { auto build_start = std::chrono::high_resolution_clock::now(); - hash_table = - build_is_columnar - ? 
build_from_columnar(build_input, config.build_attr) - : build_from_intermediate(build_input, config.build_attr); + if (build_is_columnar) { + hash_table = build_from_columnar(build_input, config.build_attr); + } else { + const auto &ir = std::get(build_input.data); + + // Use tuple-based build if available and matches build_attr + if (ir.has_join_key_tuples() && ir.join_key_idx.has_value() && + *ir.join_key_idx == config.build_attr) { + hash_table.emplace(ir.join_key_tuples->row_count()); + hash_table->build_from_tuples(*ir.join_key_tuples); + } else { + // Fall back to materialized column build + const auto *mat_col = ir.get_materialized(config.build_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: build join key not materialized! " + "build_attr=%zu " + "mat_map_size=%zu num_rows=%zu has_tuples=%d\n", + config.build_attr, ir.materialized_map.size(), + ir.num_rows, ir.has_join_key_tuples() ? 1 : 0); + std::abort(); + } + hash_table.emplace(mat_col->row_count()); + hash_table->build_intermediate(*mat_col); + } + } auto build_end = std::chrono::high_resolution_clock::now(); stats.hashtable_build_ms += std::chrono::duration_cast(build_end - @@ -261,36 +327,91 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, .count(); } - /* Dispatch based on collection mode - single runtime branch, then - * fully specialized template instantiation with zero branching in hot - * loops. */ - switch (collection_mode) { + // Dispatch based on collection mode + switch (mode) { case MatchCollectionMode::BOTH: return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); case MatchCollectionMode::LEFT_ONLY: return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? 
nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); case MatchCollectionMode::RIGHT_ONLY: return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); } - // Should never reach here, but satisfy compiler - return ExecuteResult{}; + return IntermediateResult{}; } + +/** + * @brief Debug helper: prints the plan tree level by level with per-attribute metadata; nodes matching the holds_alternative check are skipped. + * + * @param plan the query plan itself. + * @param q queue of (node index, parent's key attribute for that node) pairs; should contain the root node. + * @param table_id not read anywhere in the body. + * NOTE(review): both visible calls pass only (plan, q) although three parameters are declared, and the inner loop's i shadows the outer i - confirm this builds and is intended. + **/ +static std::function>, int)> +print_plan = [](const Plan& plan, std::queue> q, int table_id) { + if (q.empty()) return; + int initial_size = q.size(); + for (int i = 0; i < initial_size; i++) { + auto [node_idx, parent_attr] = q.front(); + q.pop(); + const auto& node = plan.nodes[node_idx]; + if (std::holds_alternative(node.data)) { + continue; + } + const auto data = std::get(node.data); + + std::cout << " node: "<< node_idx << " size: " + << node.output_attrs.size() << std::endl; + + bool match_left = false; + bool match_right = false; + for (int i = 0; i < node.output_attrs.size(); i++) { + auto [col, type] = node.output_attrs[i]; + if (node_idx != plan.root) { + if (i == parent_attr) std::cout << "build->"; + else std::cout << "defer->"; + } + if (col < plan.nodes[data.left].output_attrs.size()) { + std::cout << "left->"; + match_left = true; + } else { + std::cout << "right->"; + match_right = true; + } + if (DataType::INT32 == type) std::cout << "(" << col << ", INT32)"; + else std::cout << "(" << col << ", STR)"; + std::cout << std::endl; + + } + std::cout << "===="; + if (match_left && match_right) std::cout << "Match both"; + else if (match_left) std::cout << "Match left"; + else 
std::cout << "Match right"; + std::cout << "====" << std::endl; + + std::cout << "left_key: " << data.left_attr << " left child: " << data.left; + std::cout << "\nright_key: " << data.right_attr << " right child: " << data.right; + q.emplace(data.left, data.left_attr); + q.emplace(data.right, data.right_attr); + std::cout << "\n\n\n\n\n"; + } + print_plan(plan, std::move(q)); +}; + + /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. @@ -307,7 +428,22 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, auto total_start = std::chrono::high_resolution_clock::now(); TimingStats stats; - auto result = execute_impl(plan, plan.root, true, stats); + + // Analyze plan and execute with deferred intermediate construction + auto analyze_start = std::chrono::high_resolution_clock::now(); + AnalyzedPlan analyzed_plan = analyze_plan(plan); + auto analyze_end = std::chrono::high_resolution_clock::now(); + stats.analyze_plan_ms = + std::chrono::duration_cast(analyze_end - + analyze_start) + .count(); + /* + auto result = execute_impl(analyzed_plan, plan.root, true, stats); + ColumnarTable final_result = std::get(std::move(result)); + */ + std::queue> q; + q.emplace(plan.root, 0); + print_plan(plan, q); auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast( @@ -317,9 +453,11 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, if (show_detailed_timing) { int64_t accounted = stats.hashtable_build_ms + stats.hash_join_probe_ms + - stats.nested_loop_join_ms + stats.materialize_ms + stats.setup_ms; + stats.nested_loop_join_ms + stats.materialize_ms + stats.setup_ms + + stats.intermediate_ms + stats.analyze_plan_ms; int64_t other = stats.total_execution_ms - accounted; + std::cout << "Plan Analysis Time: " << stats.analyze_plan_ms << " ms\n"; std::cout << "Hashtable Build Time: " << 
stats.hashtable_build_ms << " ms\n"; std::cout << "Hash Join Probe Time: " << stats.hash_join_probe_ms @@ -339,7 +477,7 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, *stats_out = stats; } - return std::move(std::get(result)); + return ColumnarTable(); } void *build_context() { return nullptr; }