From 8671c489f6c24238fcb94170c0125cbd5007c2fd Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 14 Nov 2025 18:08:29 +0000 Subject: [PATCH 01/75] TPCH-derived Q3 Only lightly tested --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 24 + cpp/benchmarks/streaming/ndsh/q03.cpp | 957 +++++++++++++++++++ 2 files changed, 981 insertions(+) create mode 100644 cpp/benchmarks/streaming/ndsh/q03.cpp diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index 6fa4bd27b..18b618385 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -53,12 +53,36 @@ target_link_libraries( q09 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ $ maybe_asan ) +add_executable(q03 "q03.cpp") +set_target_properties( + q03 + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON +) +target_compile_options( + q03 PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" + "$<$:${RAPIDSMPF_CUDA_FLAGS}>" +) +target_link_libraries( + q03 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ + $ maybe_asan +) + install( TARGETS rapidsmpfndsh COMPONENT benchmarking DESTINATION ${lib_dir} EXCLUDE_FROM_ALL ) +install( + TARGETS q03 + COMPONENT benchmarking + DESTINATION bin/benchmarks/librapidsmpf + EXCLUDE_FROM_ALL +) install( TARGETS q09 COMPONENT benchmarking diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp new file mode 100644 index 000000000..8059f3c30 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -0,0 +1,957 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "concatenate.hpp" +#include "join.hpp" +#include "rapidsmpf/cuda_stream.hpp" +#include "rapidsmpf/owning_wrapper.hpp" +#include "utilities.hpp" + +// select +// l_orderkey, +// sum(l_extendedprice * (1 - l_discount)) as revenue, +// o_orderdate, +// o_shippriority +// from +// customer, +// orders, +// lineitem +// where +// c_mktsegment = 'BUILDING' +// and c_custkey = o_custkey +// and l_orderkey = o_orderkey +// and o_orderdate < '1995-03-15' +// and l_shipdate > '1995-03-15' +// group by +// l_orderkey, +// o_orderdate, +// o_shippriority +// order by +// revenue desc, +// o_orderdate +// limit 10 + +namespace { + +std::string get_table_path( + std::string const& input_directory, std::string const& table_name +) { + auto dir = input_directory.empty() ? "." 
: input_directory; + auto file_path = dir + "/" + table_name + ".parquet"; + + if (std::filesystem::exists(file_path)) { + return file_path; + } + + return dir + "/" + table_name + "/"; +} + +rapidsmpf::streaming::Node read_customer( + std::shared_ptr ctx, + std::shared_ptr ch_out, + std::size_t num_producers, + cudf::size_type num_rows_per_chunk, + std::string const& input_directory +) { + auto files = rapidsmpf::ndsh::detail::list_parquet_files( + get_table_path(input_directory, "customer") + ); + auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) + .columns({"c_custkey"}) // 0 + .build(); + auto filter_expr = [&]() -> std::unique_ptr { + auto stream = ctx->br()->stream_pool().get_stream(); + auto owner = new std::vector; + owner->push_back(std::make_shared("BUILDING", true, stream)); + owner->push_back( + std::make_shared( + *std::any_cast>(owner->at(0)) + ) + ); + owner->push_back( + std::make_shared("c_mktsegment") + ); + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::EQUAL, + *std::any_cast>( + owner->at(2) + ), + *std::any_cast>(owner->at(1)) + ) + ); + return std::make_unique( + stream, + *std::any_cast>(owner->back()), + rapidsmpf::OwningWrapper(static_cast(owner), [](void* p) { + delete static_cast*>(p); + }) + ); + }(); + return rapidsmpf::streaming::node::read_parquet( + ctx, ch_out, num_producers, options, num_rows_per_chunk, std::move(filter_expr) + ); +} + +[[maybe_unused]] rapidsmpf::streaming::Node read_lineitem( + std::shared_ptr ctx, + std::shared_ptr ch_out, + std::size_t num_producers, + cudf::size_type num_rows_per_chunk, + std::string const& input_directory +) { + auto files = rapidsmpf::ndsh::detail::list_parquet_files( + get_table_path(input_directory, "lineitem") + ); + auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) + .columns({ + "l_orderkey", // 0 + "l_extendedprice", // 1 + "l_discount", // 2 + }) + .build(); + auto filter_expr = [&]() -> std::unique_ptr { + auto stream = ctx->br()->stream_pool().get_stream(); + auto owner = new std::vector; + constexpr auto date = cuda::std::chrono::year_month_day( + cuda::std::chrono::year(1995), + cuda::std::chrono::month(3), + cuda::std::chrono::day(15) + ); + auto sys_days = cuda::std::chrono::sys_days(date); + owner->push_back( + std::make_shared>( + sys_days, true, stream + ) + ); + owner->push_back( + std::make_shared( + *std::any_cast< + std::shared_ptr>>( + owner->at(0) + ) + ) + ); + owner->push_back( + std::make_shared("l_shipdate") + ); + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::GREATER, + *std::any_cast>( + owner->at(2) + ), + *std::any_cast>(owner->at(1)) + ) + ); + return std::make_unique( + stream, + *std::any_cast>(owner->back()), + rapidsmpf::OwningWrapper(static_cast(owner), [](void* p) { + delete static_cast*>(p); + }) + ); + }(); + return rapidsmpf::streaming::node::read_parquet( + ctx, ch_out, num_producers, options, num_rows_per_chunk, std::move(filter_expr) + ); +} + +rapidsmpf::streaming::Node read_orders( + std::shared_ptr ctx, + std::shared_ptr ch_out, + std::size_t num_producers, + cudf::size_type num_rows_per_chunk, + std::string const& input_directory +) { + auto files = rapidsmpf::ndsh::detail::list_parquet_files( + get_table_path(input_directory, "orders") + ); + auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) + .columns({ + "o_orderkey", // 0 + "o_orderdate", // 1 + "o_shippriority", // 2 + "o_custkey" // 3 + }) + .build(); + auto 
filter_expr = [&]() -> std::unique_ptr { + auto stream = ctx->br()->stream_pool().get_stream(); + auto owner = new std::vector; + constexpr auto date = cuda::std::chrono::year_month_day( + cuda::std::chrono::year(1995), + cuda::std::chrono::month(3), + cuda::std::chrono::day(15) + ); + auto sys_days = cuda::std::chrono::sys_days(date); + owner->push_back( + std::make_shared>( + sys_days, true, stream + ) + ); + owner->push_back( + std::make_shared( + *std::any_cast< + std::shared_ptr>>( + owner->at(0) + ) + ) + ); + owner->push_back( + std::make_shared("o_orderdate") + ); + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::LESS, + *std::any_cast>( + owner->at(2) + ), + *std::any_cast>(owner->at(1)) + ) + ); + return std::make_unique( + stream, + *std::any_cast>(owner->back()), + rapidsmpf::OwningWrapper(static_cast(owner), [](void* p) { + delete static_cast*>(p); + }) + ); + }(); + return rapidsmpf::streaming::node::read_parquet( + ctx, ch_out, num_producers, options, num_rows_per_chunk, std::move(filter_expr) + ); +} + +// In: [o_orderkey, o_orderdate, o_shippriority, revenue] +[[maybe_unused]] rapidsmpf::streaming::Node chunkwise_groupby_agg( + [[maybe_unused]] std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + std::vector partial_results; + std::uint64_t sequence = 0; + co_await ctx->executor()->schedule(); + ctx->comm()->logger().print("Chunkwise groupby"); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto chunk_stream = chunk.stream(); + auto table = chunk.table_view(); + + auto grouper = cudf::groupby::groupby( + // group by [o_orderkey, o_orderdate, o_shippriority] + table.select({0, 1, 2}), + cudf::null_policy::EXCLUDE, + cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + // Drop chunk, we don't need it. + std::ignore = std::move(chunk); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + co_await ch_out->send( + rapidsmpf::streaming::to_message( + sequence++, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +[[maybe_unused]] rapidsmpf::streaming::Node final_groupby_agg( + [[maybe_unused]] std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + rapidsmpf::OpID tag +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + // TODO: requires concatenated input stream. 
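+    // The upstream concatenate node has already collapsed the chunkwise partials
+    // into a single chunk, so one receive (plus the end-of-stream check asserted
+    // below) is all that is needed here.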
+ auto msg = co_await ch_in->receive(); + auto next = co_await ch_in->receive(); + ctx->comm()->logger().print("Final groupby"); + RAPIDSMPF_EXPECTS(next.empty(), "Expecting concatenated input at this point"); + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto chunk_stream = chunk.stream(); + auto table = chunk.table_view(); + std::unique_ptr local_result{nullptr}; + if (!table.is_empty()) { + auto grouper = cudf::groupby::groupby( + table.select({0, 1, 2}), cudf::null_policy::EXCLUDE, cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + // Drop chunk, we don't need it. + std::ignore = std::move(chunk); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + local_result = std::make_unique(std::move(result)); + } + if (ctx->comm()->nranks() > 1) { + // Reduce across ranks... + // Need a reduce primitive in rapidsmpf, but let's just use an allgather and + // discard for now. + rapidsmpf::streaming::AllGather gatherer{ctx, tag}; + if (local_result) { + auto pack = + cudf::pack(local_result->view(), chunk_stream, ctx->br()->device_mr()); + gatherer.insert( + 0, + {rapidsmpf::PackedData( + std::move(pack.metadata), + ctx->br()->move(std::move(pack.gpu_data), chunk_stream) + )} + ); + } + gatherer.insert_finished(); + auto packed_data = + co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); + if (ctx->comm()->rank() == 0) { + std::vector chunks; + chunks.reserve(packed_data.size()); + std::ranges::transform( + packed_data, std::back_inserter(chunks), [](auto& chunk) { + return std::move(chunk.data); + } + ); + auto global_result = rapidsmpf::unpack_and_concat( + rapidsmpf::unspill_partitions( + std::move(chunks), ctx->br(), true, ctx->statistics() + ), + chunk_stream, + ctx->br(), + ctx->statistics() + ); + if (ctx->comm()->rank() == 0) { + // We will only actually bother to do this on rank zero. 
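+                // Summing the gathered per-rank partial sums per
+                // (o_orderkey, o_orderdate, o_shippriority) group gives the same
+                // answer as one global groupby, since SUM distributes over the
+                // chunkwise partial aggregation.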
+ auto result_view = global_result->view(); + auto grouper = cudf::groupby::groupby( + result_view.select({0, 1, 2}), + cudf::null_policy::EXCLUDE, + cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + cudf::groupby::aggregation_request( + result_view.column(3), std::move(aggs) + ) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + global_result.reset(); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + co_await ch_out->send( + rapidsmpf::streaming::to_message( + 0, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } + } else { + std::ignore = std::move(packed_data); + } + } else { + co_await ch_out->send( + rapidsmpf::streaming::to_message( + 0, + std::make_unique( + std::move(local_result), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +// In: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, l_discount +// Out: o_orderkey, o_orderdate, o_shippriority, revenue = (l_extendedprice - (1 - +// l_discount)) +[[maybe_unused]] rapidsmpf::streaming::Node select_columns_for_groupby( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + + co_await ctx->executor()->schedule(); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto chunk_stream = chunk.stream(); + auto sequence_number = msg.sequence_number(); + auto table = chunk.table_view(); + std::vector> result; + result.reserve(4); + + // o_orderkey + result.push_back( + std::make_unique( + table.column(0), chunk_stream, ctx->br()->device_mr() + ) + ); + // o_orderdate + result.push_back( + std::make_unique( + table.column(1), chunk_stream, ctx->br()->device_mr() + ) + ); + // o_shippriority + result.push_back( + std::make_unique( + table.column(2), chunk_stream, ctx->br()->device_mr() + ) + ); + auto extendedprice = table.column(3); + auto discount = table.column(4); + std::string udf = + R"***( +static __device__ void calculate_revenue(double *revenue, double extprice, double discount) { + *revenue = extprice * (1 - discount); +} + )***"; + + // revenue + result.push_back( + cudf::transform( + {extendedprice, discount}, + udf, + cudf::data_type(cudf::type_id::FLOAT64), + false, + std::nullopt, + cudf::null_aware::NO, + chunk_stream, + ctx->br()->device_mr() + ) + ); + co_await ch_out->send( + rapidsmpf::streaming::to_message( + sequence_number, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +// take first 10 rows +[[maybe_unused]] rapidsmpf::streaming::Node top_k( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + std::vector keys, + std::vector order, + cudf::size_type k +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + + co_await ctx->executor()->schedule(); + std::vector> partials; + std::vector chunk_streams; + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto const indices = cudf::sorted_order( + chunk.table_view().select(keys), + order, + {}, + chunk.stream(), + ctx->br()->device_mr() + ); + partials.push_back( 
+ cudf::gather( + chunk.table_view(), + cudf::split(indices->view(), {k}, chunk.stream()).front(), + cudf::out_of_bounds_policy::DONT_CHECK, + chunk.stream(), + ctx->br()->device_mr() + ) + ); + chunk_streams.push_back(chunk.stream()); + } + + // TODO: + auto out_stream = chunk_streams.front(); + rapidsmpf::CudaEvent event; + rapidsmpf::cuda_stream_join( + std::ranges::single_view{out_stream}, chunk_streams, &event + ); + std::vector views; + std::ranges::transform(partials, std::back_inserter(views), [](auto& t) { + return t->view(); + }); + auto merged = cudf::merge(views, keys, order, {}, out_stream, ctx->br()->device_mr()); + auto result = + std::make_unique(cudf::slice(merged->view(), {0, 10}, out_stream)); + co_await ch_out->send( + rapidsmpf::streaming::to_message( + 0, + std::make_unique( + std::move(result), out_stream + ) + ) + ); + co_await ch_out->drain(ctx->executor()); +} + +// In: o_orderkey, o_orderdate, o_shippriority, revenue +[[maybe_unused]] rapidsmpf::streaming::Node write_parquet( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::string output_path +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in}; + co_await ctx->executor()->schedule(); + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + co_return; + } + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto sink = cudf::io::sink_info(output_path); + // orderkey, revenue, orderdate, shippriority + auto table = chunk.table_view().select({0, 3, 1, 2}); + auto builder = cudf::io::parquet_writer_options::builder(sink, table); + auto metadata = cudf::io::table_input_metadata(table); + metadata.column_metadata[0].set_name("l_orderkey"); + metadata.column_metadata[1].set_name("revenue"); + metadata.column_metadata[2].set_name("o_orderdate"); + metadata.column_metadata[3].set_name("o_shippriority"); + builder = builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_parquet(options, chunk.stream()); + ctx->comm()->logger().print( + "Wrote chunk with ", + chunk.table_view().num_rows(), + " rows and ", + chunk.table_view().num_columns(), + " columns to ", + output_path + ); +} + +[[maybe_unused]] rapidsmpf::streaming::Node consume( + [[maybe_unused]] std::shared_ptr ctx, + std::shared_ptr ch_in +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in}; + co_await ctx->executor()->schedule(); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + ctx->comm()->logger().print( + "Consumed chunk with ", + chunk.table_view().num_rows(), + " rows and ", + chunk.table_view().num_columns(), + " columns" + ); + } +} +} // namespace + +struct ProgramOptions { + int num_streaming_threads{1}; + int num_iterations{2}; + cudf::size_type num_rows_per_chunk{100'000'000}; + std::optional spill_device_limit{std::nullopt}; + bool use_shuffle_join = false; + std::string output_file; + std::string input_directory; +}; + +ProgramOptions parse_options(int argc, char** argv) { + ProgramOptions options; + + auto print_usage = [&argv]() { + std::cerr + << "Usage: " << argv[0] << " [options]\n" + << "Options:\n" + << " --num-streaming-threads Number of streaming threads (default: 1)\n" + << " --num-iterations Number of iterations (default: 2)\n" + << " --num-rows-per-chunk Number of rows per chunk (default: " + "100000000)\n" + << " --spill-device-limit Fractional spill device limit (default: " + "None)\n" + << " --use-shuffle-join Use shuffle join (default: false)\n" + << " --output-file Output 
file path (required)\n" + << " --input-directory Input directory path (required)\n" + << " --help Show this help message\n"; + }; + + static struct option long_options[] = { + {"num-streaming-threads", required_argument, nullptr, 1}, + {"num-rows-per-chunk", required_argument, nullptr, 2}, + {"use-shuffle-join", no_argument, nullptr, 3}, + {"output-file", required_argument, nullptr, 4}, + {"input-directory", required_argument, nullptr, 5}, + {"help", no_argument, nullptr, 6}, + {"spill-device-limit", required_argument, nullptr, 7}, + {"num-iterations", required_argument, nullptr, 8}, + {nullptr, 0, nullptr, 0} + }; + + int opt; + int option_index = 0; + + bool saw_output_file = false; + bool saw_input_directory = false; + + while ((opt = getopt_long(argc, argv, "", long_options, &option_index)) != -1) { + switch (opt) { + case 1: + options.num_streaming_threads = std::atoi(optarg); + break; + case 2: + options.num_rows_per_chunk = std::atoi(optarg); + break; + case 3: + options.use_shuffle_join = true; + break; + case 4: + options.output_file = optarg; + saw_output_file = true; + break; + case 5: + options.input_directory = optarg; + saw_input_directory = true; + break; + case 6: + print_usage(); + std::exit(0); + case 7: + options.spill_device_limit = std::stod(optarg); + break; + case 8: + options.num_iterations = std::atoi(optarg); + break; + case '?': + if (optopt == 0 && optind > 1) { + std::cerr << "Error: Unknown option '" << argv[optind - 1] << "'\n\n"; + } + print_usage(); + std::exit(1); + default: + print_usage(); + std::exit(1); + } + } + + // Check if required options were provided + if (!saw_output_file || !saw_input_directory) { + if (!saw_output_file) { + std::cerr << "Error: --output-file is required\n"; + } + if (!saw_input_directory) { + std::cerr << "Error: --input-directory is required\n"; + } + std::cerr << std::endl; + print_usage(); + std::exit(1); + } + + return options; +} + +int main(int argc, char** argv) { + cudaFree(nullptr); + rapidsmpf::mpi::init(&argc, &argv); + MPI_Comm mpi_comm; + RAPIDSMPF_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); + auto cmd_options = parse_options(argc, argv); + auto limit_size = rmm::percent_of_free_device_memory( + static_cast(cmd_options.spill_device_limit.value_or(1) * 100) + ); + rmm::mr::cuda_async_memory_resource mr{}; + // rmm::mr::cuda_memory_resource base{}; + // rmm::mr::pool_memory_resource mr{&base, pool_size}; + auto stats_mr = rapidsmpf::RmmResourceAdaptor(&mr); + rmm::device_async_resource_ref mr_ref(stats_mr); + rmm::mr::set_current_device_resource(&stats_mr); + rmm::mr::set_current_device_resource_ref(mr_ref); + std::unordered_map + memory_available{}; + if (cmd_options.spill_device_limit.has_value()) { + memory_available[rapidsmpf::MemoryType::DEVICE] = rapidsmpf::LimitAvailableMemory{ + &stats_mr, static_cast(limit_size) + }; + } + auto br = std::make_shared( + stats_mr, std::move(memory_available) + ); + auto envvars = rapidsmpf::config::get_environment_variables(); + envvars["num_streaming_threads"] = std::to_string(cmd_options.num_streaming_threads); + auto options = rapidsmpf::config::Options(envvars); + auto stats = std::make_shared(&stats_mr); + { + auto comm = rapidsmpf::ucxx::init_using_mpi(mpi_comm, options); + auto progress = + std::make_shared(comm->logger(), stats); + auto ctx = + std::make_shared(options, comm, br, stats); + comm->logger().print( + "Executor has ", ctx->executor()->thread_count(), " threads" + ); + comm->logger().print("Executor has ", ctx->comm()->nranks(), " ranks"); + + 
std::string output_path = cmd_options.output_file; + std::vector timings; + for (int i = 0; i < cmd_options.num_iterations; i++) { + int op_id{0}; + std::vector nodes; + auto start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q3 pipeline"); + + // Input data channels + auto customer = ctx->create_channel(); + auto lineitem = ctx->create_channel(); + auto orders = ctx->create_channel(); + + // join channels + auto customer_x_orders = ctx->create_channel(); + auto customer_x_orders_x_lineitem = ctx->create_channel(); + + // Out: "c_custkey" + nodes.push_back(read_customer( + ctx, + customer, + /* num_tickets */ 2, + cmd_options.num_rows_per_chunk, + cmd_options.input_directory + )); + // Out: o_orderkey, o_orderdate, o_shippriority, o_custkey + nodes.push_back(read_orders( + ctx, + orders, + 6, + cmd_options.num_rows_per_chunk, + cmd_options.input_directory + )); + // join c_custkey = o_custkey + // Out: o_orderkey, o_orderdate, o_shippriority + nodes.push_back( + rapidsmpf::ndsh::inner_join_broadcast( + ctx, + customer, + orders, + customer_x_orders, + {0}, + {3}, + static_cast(10 * i + op_id++), + rapidsmpf::ndsh::KeepKeys::NO + ) + ); + // Out: l_orderkey, l_extendedprice, l_discount + nodes.push_back(read_lineitem( + ctx, + lineitem, + /* num_tickets */ 6, + cmd_options.num_rows_per_chunk, + cmd_options.input_directory + )); + + // join o_orderkey = l_orderkey + // Out: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, + // l_discount + nodes.push_back( + rapidsmpf::ndsh::inner_join_broadcast( + ctx, + customer_x_orders, + lineitem, + customer_x_orders_x_lineitem, + {0}, + {0}, + static_cast(10 * i + op_id++), + rapidsmpf::ndsh::KeepKeys::YES + ) + ); + + auto groupby_input = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back(select_columns_for_groupby( + ctx, customer_x_orders_x_lineitem, groupby_input + )); + auto chunkwise_groupby_output = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back( + chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby_output) + ); + auto concatenated_groupby_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::concatenate( + ctx, + chunkwise_groupby_output, + concatenated_groupby_output, + rapidsmpf::ndsh::ConcatOrder::DONT_CARE + ) + ); + auto groupby_output = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back(final_groupby_agg( + ctx, + concatenated_groupby_output, + groupby_output, + static_cast(10 * i + op_id++) + )); + auto topk = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back(top_k( + ctx, + groupby_output, + topk, + {3, 1}, + {cudf::order::DESCENDING, cudf::order::ASCENDING}, + 10 + )); + + nodes.push_back(write_parquet(ctx, topk, output_path)); + } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration pipeline = end - start; + start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Q3 Iteration"); + rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration compute = end - start; + comm->logger().print( + "Iteration ", i, " pipeline construction time [s]: ", pipeline.count() + ); + comm->logger().print("Iteration ", i, " compute time [s]: ", compute.count()); + timings.push_back(pipeline.count()); + timings.push_back(compute.count()); + 
ctx->comm()->logger().print(stats->report()); + RAPIDSMPF_MPI(MPI_Barrier(mpi_comm)); + } + if (comm->rank() == 0) { + for (int i = 0; i < cmd_options.num_iterations; i++) { + comm->logger().print( + "Iteration ", + i, + " pipeline construction time [s]: ", + timings[size_t(2 * i)] + ); + comm->logger().print( + "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] + ); + } + } + } + + RAPIDSMPF_MPI(MPI_Comm_free(&mpi_comm)); + RAPIDSMPF_MPI(MPI_Finalize()); + return 0; +} From c7736d3ed903986ff3725282963b323126ee1c43 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Nov 2025 17:51:21 +0000 Subject: [PATCH 02/75] Q1 --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 31 +- cpp/benchmarks/streaming/ndsh/q01.cpp | 903 +++++++++++++++++++ 2 files changed, 930 insertions(+), 4 deletions(-) create mode 100644 cpp/benchmarks/streaming/ndsh/q01.cpp diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index 18b618385..0d99dceb6 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -36,9 +36,9 @@ target_link_libraries( $ maybe_asan ) -add_executable(q09 "q09.cpp") +add_executable(q01 "q01.cpp") set_target_properties( - q09 + q01 PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON @@ -46,11 +46,11 @@ set_target_properties( CUDA_STANDARD_REQUIRED ON ) target_compile_options( - q09 PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" + q01 PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" "$<$:${RAPIDSMPF_CUDA_FLAGS}>" ) target_link_libraries( - q09 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ + q01 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ $ maybe_asan ) add_executable(q03 "q03.cpp") @@ -70,6 +70,23 @@ target_link_libraries( q03 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ $ maybe_asan ) +add_executable(q09 "q09.cpp") +set_target_properties( + q09 + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON +) +target_compile_options( + q09 PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" + "$<$:${RAPIDSMPF_CUDA_FLAGS}>" +) +target_link_libraries( + q09 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ + $ maybe_asan +) install( TARGETS rapidsmpfndsh @@ -77,6 +94,12 @@ install( DESTINATION ${lib_dir} EXCLUDE_FROM_ALL ) +install( + TARGETS q01 + COMPONENT benchmarking + DESTINATION bin/benchmarks/librapidsmpf + EXCLUDE_FROM_ALL +) install( TARGETS q03 COMPONENT benchmarking diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp new file mode 100644 index 000000000..1a69bf20a --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -0,0 +1,903 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "concatenate.hpp" +#include "join.hpp" +#include "rapidsmpf/cuda_stream.hpp" +#include "rapidsmpf/owning_wrapper.hpp" +#include "utilities.hpp" + +// select +// l_orderkey, +// sum(l_extendedprice * (1 - l_discount)) as revenue, +// o_orderdate, +// o_shippriority +// from +// customer, +// orders, +// lineitem +// where +// c_mktsegment = 'BUILDING' +// and c_custkey = o_custkey +// and l_orderkey = o_orderkey +// and o_orderdate < '1995-03-15' +// and l_shipdate > '1995-03-15' +// group by +// l_orderkey, +// o_orderdate, +// o_shippriority +// order by +// revenue desc, +// o_orderdate +// limit 10 + +namespace { + +std::string get_table_path( + std::string const& input_directory, std::string const& table_name +) { + auto dir = input_directory.empty() ? "." : input_directory; + auto file_path = dir + "/" + table_name + ".parquet"; + + if (std::filesystem::exists(file_path)) { + return file_path; + } + + return dir + "/" + table_name + "/"; +} + +[[maybe_unused]] rapidsmpf::streaming::Node read_lineitem( + std::shared_ptr ctx, + std::shared_ptr ch_out, + std::size_t num_producers, + cudf::size_type num_rows_per_chunk, + std::string const& input_directory +) { + auto files = rapidsmpf::ndsh::detail::list_parquet_files( + get_table_path(input_directory, "lineitem") + ); + auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) + .columns({ + "l_returnflag", // 0 + "l_linestatus", // 1 + "l_quantity", // 2 + "l_extendedprice", // 3 + "l_discount", // 4 + "l_tax" // 5 + }) + .build(); + auto filter_expr = [&]() -> std::unique_ptr { + auto stream = ctx->br()->stream_pool().get_stream(); + auto owner = new std::vector; + constexpr auto date = cuda::std::chrono::year_month_day( + cuda::std::chrono::year(1998), + cuda::std::chrono::month(9), + cuda::std::chrono::day(2) + ); + auto sys_days = cuda::std::chrono::sys_days(date); + owner->push_back( + std::make_shared>( + sys_days, true, stream + ) + ); + owner->push_back( + std::make_shared( + *std::any_cast< + std::shared_ptr>>( + owner->at(0) + ) + ) + ); + owner->push_back( + std::make_shared("l_shipdate") + ); + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::LESS_EQUAL, + *std::any_cast>( + owner->at(2) + ), + *std::any_cast>(owner->at(1)) + ) + ); + return std::make_unique( + stream, + *std::any_cast>(owner->back()), + rapidsmpf::OwningWrapper(static_cast(owner), [](void* p) { + delete static_cast*>(p); + }) + ); + }(); + return rapidsmpf::streaming::node::read_parquet( + ctx, ch_out, num_producers, options, num_rows_per_chunk, std::move(filter_expr) + ); +} + +// l_returnflag, l_linestatus, l_quantity, l_extendedprice, +// disc_price = (l_extendedprice * (1 - l_discount)), +// charge = (l_extendedprice * (1 - l_discount) * (1 + l_tax)) +// l_discount +[[maybe_unused]] rapidsmpf::streaming::Node chunkwise_groupby_agg( + [[maybe_unused]] std::shared_ptr ctx, + std::shared_ptr ch_in, + 
std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + std::vector partial_results; + std::uint64_t sequence = 0; + co_await ctx->executor()->schedule(); + ctx->comm()->logger().print("Chunkwise groupby"); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto chunk_stream = chunk.stream(); + auto table = chunk.table_view(); + + auto grouper = cudf::groupby::groupby( + // group by [l_returnflag, l_linestatus] + table.select({0, 1}), + cudf::null_policy::EXCLUDE, + cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_quantity) + cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_extendedprice) + cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(disc_price) + cudf::groupby::aggregation_request(table.column(4), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(charge) + cudf::groupby::aggregation_request(table.column(5), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_discount) + cudf::groupby::aggregation_request(table.column(6), std::move(aggs)) + ); + aggs.push_back( + cudf::make_count_aggregation( + cudf::null_policy::INCLUDE + ) + ); + requests.push_back( + // count(*) + cudf::groupby::aggregation_request(table.column(0), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + // Drop chunk, we don't need it. + std::ignore = std::move(chunk); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + co_await ch_out->send( + rapidsmpf::streaming::to_message( + sequence++, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +[[maybe_unused]] rapidsmpf::streaming::Node final_groupby_agg( + [[maybe_unused]] std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + rapidsmpf::OpID tag +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + // TODO: requires concatenated input stream. 
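+    // The concatenate node feeding this channel merges all chunkwise partials
+    // into one chunk; the end-of-stream check below verifies that.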
+ auto msg = co_await ch_in->receive(); + auto next = co_await ch_in->receive(); + ctx->comm()->logger().print("Final groupby"); + RAPIDSMPF_EXPECTS(next.empty(), "Expecting concatenated input at this point"); + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto chunk_stream = chunk.stream(); + auto table = chunk.table_view(); + std::unique_ptr local_result{nullptr}; + if (!table.is_empty()) { + auto grouper = cudf::groupby::groupby( + table.select({0, 1}), cudf::null_policy::EXCLUDE, cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_quantity) + cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_extendedprice) + cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(disc_price) + cudf::groupby::aggregation_request(table.column(4), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(charge) + cudf::groupby::aggregation_request(table.column(5), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_discount) + cudf::groupby::aggregation_request(table.column(6), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(count(*)) + cudf::groupby::aggregation_request(table.column(7), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + // Drop chunk, we don't need it. + std::ignore = std::move(chunk); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + local_result = std::make_unique(std::move(result)); + } + if (ctx->comm()->nranks() > 1) { + // Reduce across ranks... + // Need a reduce primitive in rapidsmpf, but let's just use an allgather and + // discard for now. 
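+        // Every rank contributes its packed partials and receives everyone else's;
+        // only rank 0 unpacks and re-aggregates, the rest discard. The partials
+        // hold at most one row per (l_returnflag, l_linestatus) group, so the
+        // redundant transfer should be negligible.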
+ rapidsmpf::streaming::AllGather gatherer{ctx, tag}; + if (local_result) { + auto pack = + cudf::pack(local_result->view(), chunk_stream, ctx->br()->device_mr()); + gatherer.insert( + 0, + {rapidsmpf::PackedData( + std::move(pack.metadata), + ctx->br()->move(std::move(pack.gpu_data), chunk_stream) + )} + ); + } + gatherer.insert_finished(); + auto packed_data = + co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); + if (ctx->comm()->rank() == 0) { + std::vector chunks; + chunks.reserve(packed_data.size()); + std::ranges::transform( + packed_data, std::back_inserter(chunks), [](auto& chunk) { + return std::move(chunk.data); + } + ); + auto global_result = rapidsmpf::unpack_and_concat( + rapidsmpf::unspill_partitions( + std::move(chunks), ctx->br(), true, ctx->statistics() + ), + chunk_stream, + ctx->br(), + ctx->statistics() + ); + auto table = global_result->view(); + auto grouper = cudf::groupby::groupby( + table.select({0, 1}), cudf::null_policy::EXCLUDE, cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_quantity) + cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_extendedprice) + cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(disc_price) + cudf::groupby::aggregation_request(table.column(4), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(charge) + cudf::groupby::aggregation_request(table.column(5), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_discount) + cudf::groupby::aggregation_request(table.column(6), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(count(*)) + cudf::groupby::aggregation_request(table.column(7), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + // Drop chunk, we don't need it. 
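+            // After re-summing, the two trailing columns hold sum(l_discount) and
+            // the global row count; the code below pops them off and derives
+            // avg_qty, avg_price and avg_disc by dividing the matching sums by
+            // count_order.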
+ std::ignore = std::move(chunk); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + auto count = std::move(result.back()); + result.pop_back(); + auto discount = std::move(result.back()); + result.pop_back(); + for (std::size_t i = 2; i < 4; i++) { + result.push_back( + cudf::binary_operation( + result[i]->view(), + count->view(), + cudf::binary_operator::TRUE_DIV, + cudf::data_type(cudf::type_id::FLOAT64), + chunk_stream, + ctx->br()->device_mr() + ) + ); + } + result.push_back( + cudf::binary_operation( + discount->view(), + count->view(), + cudf::binary_operator::TRUE_DIV, + cudf::data_type(cudf::type_id::FLOAT64), + chunk_stream, + ctx->br()->device_mr() + ) + ); + + result.push_back(std::move(count)); + co_await ch_out->send( + rapidsmpf::streaming::to_message( + 0, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } else { + std::ignore = std::move(packed_data); + } + } else { + auto result = local_result->release(); + auto count = std::move(result.back()); + result.pop_back(); + auto discount = std::move(result.back()); + result.pop_back(); + for (std::size_t i = 2; i < 4; i++) { + result.push_back( + cudf::binary_operation( + result[i]->view(), + count->view(), + cudf::binary_operator::TRUE_DIV, + cudf::data_type(cudf::type_id::FLOAT64), + chunk_stream, + ctx->br()->device_mr() + ) + ); + } + result.push_back( + cudf::binary_operation( + discount->view(), + count->view(), + cudf::binary_operator::TRUE_DIV, + cudf::data_type(cudf::type_id::FLOAT64), + chunk_stream, + ctx->br()->device_mr() + ) + ); + result.push_back(std::move(count)); + co_await ch_out->send( + rapidsmpf::streaming::to_message( + 0, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +// In: l_returnflag, l_linestatus, l_quantity, l_extendedprice, +// l_discount, l_tax +// Out: l_returnflag, l_linestatus, l_quantity, l_extendedprice, +// disc_price = (l_extendedprice * (1 - l_discount)), +// charge = (l_extendedprice * (1 - l_discount) * (1 + l_tax)), +// l_discount +[[maybe_unused]] rapidsmpf::streaming::Node select_columns_for_groupby( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + + co_await ctx->executor()->schedule(); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto chunk_stream = chunk.stream(); + auto sequence_number = msg.sequence_number(); + auto table = chunk.table_view(); + // l_returnflag, l_linestatus, l_quantity, l_extendedprice + auto result = + cudf::table(table.select({0, 1, 2, 3}), chunk_stream, ctx->br()->device_mr()) + .release(); + result.reserve(7); + auto extendedprice = table.column(3); + auto discount = table.column(4); + auto tax = table.column(5); + std::string udf_disc_price = + R"***( +static __device__ void calculate_disc_price(double *disc_price, double extprice, double discount) { + *disc_price = extprice * (1 - discount); +} + )***"; + std::string udf_charge = + R"***( +static __device__ void calculate_charge(double *charge, double discprice, double tax) { + *charge = discprice * (1 + tax); +} + )***"; + + // disc_price + result.push_back( + cudf::transform( + {extendedprice, discount}, + udf_disc_price, + cudf::data_type(cudf::type_id::FLOAT64), + false, + std::nullopt, + 
cudf::null_aware::NO, + chunk_stream, + ctx->br()->device_mr() + ) + ); + // charge + result.push_back( + cudf::transform( + {result.back()->view(), tax}, + udf_charge, + cudf::data_type(cudf::type_id::FLOAT64), + false, + std::nullopt, + cudf::null_aware::NO, + chunk_stream, + ctx->br()->device_mr() + ) + ); + // l_discount + result.push_back( + std::make_unique(discount, chunk_stream, ctx->br()->device_mr()) + ); + co_await ch_out->send( + rapidsmpf::streaming::to_message( + sequence_number, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +[[maybe_unused]] rapidsmpf::streaming::Node sort_by( + [[maybe_unused]] std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + auto msg = co_await ch_in->receive(); + // We know we only have a single chunk from the groupby + if (msg.empty()) { + co_return; + } + ctx->comm()->logger().print("Sortby"); + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto table = chunk.table_view(); + auto result = rapidsmpf::streaming::to_message( + 0, + std::make_unique( + cudf::sort_by_key( + table, + table.select({0, 1}), + {cudf::order::ASCENDING, cudf::order::ASCENDING}, + {cudf::null_order::BEFORE, cudf::null_order::BEFORE}, + chunk.stream(), + ctx->br()->device_mr() + ), + chunk.stream() + ) + ); + co_await ch_out->send(std::move(result)); + co_await ch_out->drain(ctx->executor()); +} + +// In: o_orderkey, o_orderdate, o_shippriority, revenue +[[maybe_unused]] rapidsmpf::streaming::Node write_parquet( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::string output_path +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in}; + co_await ctx->executor()->schedule(); + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + co_return; + } + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto sink = cudf::io::sink_info(output_path); + auto table = chunk.table_view(); + auto builder = cudf::io::parquet_writer_options::builder(sink, table); + auto metadata = cudf::io::table_input_metadata(table); + metadata.column_metadata[0].set_name("l_returnflag"); + metadata.column_metadata[1].set_name("l_linestatus"); + metadata.column_metadata[2].set_name("sum_qty"); + metadata.column_metadata[3].set_name("sum_base_price"); + metadata.column_metadata[4].set_name("sum_disc_price"); + metadata.column_metadata[5].set_name("sum_charge"); + metadata.column_metadata[6].set_name("avg_qty"); + metadata.column_metadata[7].set_name("avg_price"); + metadata.column_metadata[8].set_name("avg_disc"); + metadata.column_metadata[9].set_name("count_order"); + builder = builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_parquet(options, chunk.stream()); + ctx->comm()->logger().print( + "Wrote chunk with ", + chunk.table_view().num_rows(), + " rows and ", + chunk.table_view().num_columns(), + " columns to ", + output_path + ); +} + +[[maybe_unused]] rapidsmpf::streaming::Node consume( + [[maybe_unused]] std::shared_ptr ctx, + std::shared_ptr ch_in +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in}; + co_await ctx->executor()->schedule(); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + ctx->comm()->logger().print( + "Consumed chunk with ", + chunk.table_view().num_rows(), + " rows and ", + 
chunk.table_view().num_columns(), + " columns" + ); + } +} +} // namespace + +struct ProgramOptions { + int num_streaming_threads{1}; + int num_iterations{2}; + cudf::size_type num_rows_per_chunk{100'000'000}; + std::optional spill_device_limit{std::nullopt}; + bool use_shuffle_join = false; + std::string output_file; + std::string input_directory; +}; + +ProgramOptions parse_options(int argc, char** argv) { + ProgramOptions options; + + auto print_usage = [&argv]() { + std::cerr + << "Usage: " << argv[0] << " [options]\n" + << "Options:\n" + << " --num-streaming-threads Number of streaming threads (default: 1)\n" + << " --num-iterations Number of iterations (default: 2)\n" + << " --num-rows-per-chunk Number of rows per chunk (default: " + "100000000)\n" + << " --spill-device-limit Fractional spill device limit (default: " + "None)\n" + << " --use-shuffle-join Use shuffle join (default: false)\n" + << " --output-file Output file path (required)\n" + << " --input-directory Input directory path (required)\n" + << " --help Show this help message\n"; + }; + + static struct option long_options[] = { + {"num-streaming-threads", required_argument, nullptr, 1}, + {"num-rows-per-chunk", required_argument, nullptr, 2}, + {"use-shuffle-join", no_argument, nullptr, 3}, + {"output-file", required_argument, nullptr, 4}, + {"input-directory", required_argument, nullptr, 5}, + {"help", no_argument, nullptr, 6}, + {"spill-device-limit", required_argument, nullptr, 7}, + {"num-iterations", required_argument, nullptr, 8}, + {nullptr, 0, nullptr, 0} + }; + + int opt; + int option_index = 0; + + bool saw_output_file = false; + bool saw_input_directory = false; + + while ((opt = getopt_long(argc, argv, "", long_options, &option_index)) != -1) { + switch (opt) { + case 1: + options.num_streaming_threads = std::atoi(optarg); + break; + case 2: + options.num_rows_per_chunk = std::atoi(optarg); + break; + case 3: + options.use_shuffle_join = true; + break; + case 4: + options.output_file = optarg; + saw_output_file = true; + break; + case 5: + options.input_directory = optarg; + saw_input_directory = true; + break; + case 6: + print_usage(); + std::exit(0); + case 7: + options.spill_device_limit = std::stod(optarg); + break; + case 8: + options.num_iterations = std::atoi(optarg); + break; + case '?': + if (optopt == 0 && optind > 1) { + std::cerr << "Error: Unknown option '" << argv[optind - 1] << "'\n\n"; + } + print_usage(); + std::exit(1); + default: + print_usage(); + std::exit(1); + } + } + + // Check if required options were provided + if (!saw_output_file || !saw_input_directory) { + if (!saw_output_file) { + std::cerr << "Error: --output-file is required\n"; + } + if (!saw_input_directory) { + std::cerr << "Error: --input-directory is required\n"; + } + std::cerr << std::endl; + print_usage(); + std::exit(1); + } + + return options; +} + +int main(int argc, char** argv) { + cudaFree(nullptr); + rapidsmpf::mpi::init(&argc, &argv); + MPI_Comm mpi_comm; + RAPIDSMPF_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); + auto cmd_options = parse_options(argc, argv); + auto limit_size = rmm::percent_of_free_device_memory( + static_cast(cmd_options.spill_device_limit.value_or(1) * 100) + ); + rmm::mr::cuda_async_memory_resource mr{}; + // rmm::mr::cuda_memory_resource base{}; + // rmm::mr::pool_memory_resource mr{&base, pool_size}; + auto stats_mr = rapidsmpf::RmmResourceAdaptor(&mr); + rmm::device_async_resource_ref mr_ref(stats_mr); + rmm::mr::set_current_device_resource(&stats_mr); + 
rmm::mr::set_current_device_resource_ref(mr_ref); + std::unordered_map + memory_available{}; + if (cmd_options.spill_device_limit.has_value()) { + memory_available[rapidsmpf::MemoryType::DEVICE] = rapidsmpf::LimitAvailableMemory{ + &stats_mr, static_cast(limit_size) + }; + } + auto br = std::make_shared( + stats_mr, std::move(memory_available) + ); + auto envvars = rapidsmpf::config::get_environment_variables(); + envvars["num_streaming_threads"] = std::to_string(cmd_options.num_streaming_threads); + auto options = rapidsmpf::config::Options(envvars); + auto stats = std::make_shared(&stats_mr); + { + auto comm = rapidsmpf::ucxx::init_using_mpi(mpi_comm, options); + auto progress = + std::make_shared(comm->logger(), stats); + auto ctx = + std::make_shared(options, comm, br, stats); + comm->logger().print( + "Executor has ", ctx->executor()->thread_count(), " threads" + ); + comm->logger().print("Executor has ", ctx->comm()->nranks(), " ranks"); + + std::string output_path = cmd_options.output_file; + std::vector timings; + [[maybe_unused]] int op_id = 0; + for (int i = 0; i < cmd_options.num_iterations; i++) { + std::vector nodes; + auto start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q1 pipeline"); + + // Input data channels + auto lineitem = ctx->create_channel(); + // Out: l_returnflag, l_linestatus, l_quantity, l_extendedprice, + // l_discount, l_tax + nodes.push_back(read_lineitem( + ctx, + lineitem, + /* num_tickets */ 8, + cmd_options.num_rows_per_chunk, + cmd_options.input_directory + )); + + auto groupby_input = ctx->create_channel(); + // Out: l_returnflag, l_linestatus, l_quantity, l_extendedprice, + // disc_price = (l_extendedprice * (1 - l_discount)), + // charge = (l_extendedprice * (1 - l_discount) * (1 + l_tax)) + // l_discount + nodes.push_back(select_columns_for_groupby(ctx, lineitem, groupby_input)); + auto chunkwise_groupby = ctx->create_channel(); + nodes.push_back( + chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby) + ); + auto final_groupby_input = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::concatenate( + ctx, chunkwise_groupby, final_groupby_input + ) + ); + auto groupby_output = ctx->create_channel(); + nodes.push_back(final_groupby_agg( + ctx, + final_groupby_input, + groupby_output, + static_cast(10 * i + op_id++) + )); + auto sorted = ctx->create_channel(); + nodes.push_back(sort_by(ctx, groupby_output, sorted)); + nodes.push_back(write_parquet(ctx, sorted, output_path)); + } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration pipeline = end - start; + start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Q3 Iteration"); + rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration compute = end - start; + comm->logger().print( + "Iteration ", i, " pipeline construction time [s]: ", pipeline.count() + ); + comm->logger().print("Iteration ", i, " compute time [s]: ", compute.count()); + timings.push_back(pipeline.count()); + timings.push_back(compute.count()); + ctx->comm()->logger().print(stats->report()); + RAPIDSMPF_MPI(MPI_Barrier(mpi_comm)); + } + if (comm->rank() == 0) { + for (int i = 0; i < cmd_options.num_iterations; i++) { + comm->logger().print( + "Iteration ", + i, + " pipeline construction time [s]: ", + timings[size_t(2 * i)] + ); + comm->logger().print( + "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] + ); + } + } + } + + 
RAPIDSMPF_MPI(MPI_Comm_free(&mpi_comm)); + RAPIDSMPF_MPI(MPI_Finalize()); + return 0; +} From b4ef3fe0304c3af8b751245cc9dc495741d048a3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Nov 2025 17:55:55 +0000 Subject: [PATCH 03/75] Parallel grouping --- cpp/benchmarks/streaming/ndsh/q01.cpp | 146 ++++++++++++++------------ 1 file changed, 76 insertions(+), 70 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index 1a69bf20a..e443a3cbe 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -66,6 +66,7 @@ #include "join.hpp" #include "rapidsmpf/cuda_stream.hpp" #include "rapidsmpf/owning_wrapper.hpp" +#include "rapidsmpf/streaming/core/coro_utils.hpp" #include "utilities.hpp" // select @@ -186,78 +187,83 @@ std::string get_table_path( rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; std::vector partial_results; std::uint64_t sequence = 0; - co_await ctx->executor()->schedule(); ctx->comm()->logger().print("Chunkwise groupby"); - while (true) { - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - break; - } - auto chunk = rapidsmpf::ndsh::to_device( - ctx, msg.release() - ); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); + auto grouper = [&]() -> coro::task { + while (true) { + auto msg = co_await ch_in->receive(); + co_await ctx->executor()->schedule(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto chunk_stream = chunk.stream(); + auto table = chunk.table_view(); - auto grouper = cudf::groupby::groupby( - // group by [l_returnflag, l_linestatus] - table.select({0, 1}), - cudf::null_policy::EXCLUDE, - cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_quantity) - cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_extendedprice) - cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(disc_price) - cudf::groupby::aggregation_request(table.column(4), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(charge) - cudf::groupby::aggregation_request(table.column(5), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_discount) - cudf::groupby::aggregation_request(table.column(6), std::move(aggs)) - ); - aggs.push_back( - cudf::make_count_aggregation( - cudf::null_policy::INCLUDE - ) - ); - requests.push_back( - // count(*) - cudf::groupby::aggregation_request(table.column(0), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. 
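+            // NOTE: several grouper() tasks run concurrently (see the when_all
+            // below), so partials may reach the output channel out of order; the
+            // downstream final groupby is order-insensitive, so that is safe.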
- std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - co_await ch_out->send( - rapidsmpf::streaming::to_message( - sequence++, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream + auto grouper = cudf::groupby::groupby( + // group by [l_returnflag, l_linestatus] + table.select({0, 1}), + cudf::null_policy::EXCLUDE, + cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_quantity) + cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_extendedprice) + cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(disc_price) + cudf::groupby::aggregation_request(table.column(4), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(charge) + cudf::groupby::aggregation_request(table.column(5), std::move(aggs)) + ); + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + // sum(l_discount) + cudf::groupby::aggregation_request(table.column(6), std::move(aggs)) + ); + aggs.push_back( + cudf::make_count_aggregation( + cudf::null_policy::INCLUDE ) - ) - ); - } + ); + requests.push_back( + // count(*) + cudf::groupby::aggregation_request(table.column(0), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + // Drop chunk, we don't need it. + std::ignore = std::move(chunk); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + co_await ch_out->send( + rapidsmpf::streaming::to_message( + sequence++, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } + }; + rapidsmpf::streaming::coro_results( + co_await coro::when_all(grouper(), grouper(), grouper(), grouper()) + ); co_await ch_out->drain(ctx->executor()); } @@ -832,7 +838,7 @@ int main(int argc, char** argv) { nodes.push_back(read_lineitem( ctx, lineitem, - /* num_tickets */ 8, + /* num_tickets */ 4, cmd_options.num_rows_per_chunk, cmd_options.input_directory )); From 2e1fdf2ed3cda4fa2ad90d99bb55c8e8f2ff8989 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Dec 2025 15:27:43 +0000 Subject: [PATCH 04/75] Dup the user's communicator when creating our MPI comm wrapper --- cpp/include/rapidsmpf/communicator/mpi.hpp | 2 +- cpp/src/communicator/mpi.cpp | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/include/rapidsmpf/communicator/mpi.hpp b/cpp/include/rapidsmpf/communicator/mpi.hpp index 39d19f9e3..7cdfb02c3 100644 --- a/cpp/include/rapidsmpf/communicator/mpi.hpp +++ b/cpp/include/rapidsmpf/communicator/mpi.hpp @@ -121,7 +121,7 @@ class MPI final : public Communicator { */ MPI(MPI_Comm comm, config::Options options); - ~MPI() noexcept override = default; + ~MPI() noexcept override; /** * @copydoc Communicator::rank diff --git a/cpp/src/communicator/mpi.cpp b/cpp/src/communicator/mpi.cpp index 767b9db6d..9e02f827f 100644 --- a/cpp/src/communicator/mpi.cpp +++ b/cpp/src/communicator/mpi.cpp @@ -96,10 +96,10 @@ void check_mpi_thread_support() { } } // namespace -MPI::MPI(MPI_Comm comm, config::Options options) - : comm_{comm}, logger_{this, std::move(options)} { 
+MPI::MPI(MPI_Comm comm, config::Options options) : logger_{this, std::move(options)} {
     int rank;
     int nranks;
+    RAPIDSMPF_MPI(MPI_Comm_dup(comm, &comm_));
     RAPIDSMPF_MPI(MPI_Comm_rank(comm_, &rank));
     RAPIDSMPF_MPI(MPI_Comm_size(comm_, &nranks));
     rank_ = rank;
@@ -107,6 +107,10 @@ MPI::MPI(MPI_Comm comm, config::Options options)
     check_mpi_thread_support();
 }
 
+MPI::~MPI() noexcept {
+    RAPIDSMPF_MPI(MPI_Comm_free(&comm_));
+}
+
 std::unique_ptr MPI::send(
     std::unique_ptr> msg, Rank rank, Tag tag
 ) {

From 32dcff72693d4c55e34e753c6c3eedd6795d7ba8 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 4 Dec 2025 16:30:05 +0000
Subject: [PATCH 05/75] Context creation and options parsing into utils

---
 cpp/benchmarks/streaming/ndsh/utils.cpp | 296 +++++++++++++++++++++++-
 cpp/benchmarks/streaming/ndsh/utils.hpp |  81 ++++++-
 2 files changed, 371 insertions(+), 6 deletions(-)

diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp
index 3f1f177b4..1d67ff82d 100644
--- a/cpp/benchmarks/streaming/ndsh/utils.cpp
+++ b/cpp/benchmarks/streaming/ndsh/utils.cpp
@@ -5,23 +5,35 @@
 
 #include "utils.hpp"
 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 #include 
-#include 
+#include 
+#include 
 #include 
 #include 
 
 namespace rapidsmpf::ndsh {
 namespace detail {
 std::vector list_parquet_files(std::string const& root_path) {
-    // Files are named `ANYTHING_somenumber.ANYTHING.parquet` Should be sorted in
-    // ascending order by their numerical part. root_path is the path to the directory
-    // containing the files.
-
     auto root_entry = std::filesystem::directory_entry(std::filesystem::path(root_path));
     RAPIDSMPF_EXPECTS(
         root_entry.exists()
@@ -59,4 +71,278 @@ streaming::TableChunk to_device(
     );
     return chunk.make_available(reservation);
 }
+
+std::shared_ptr create_context(
+    ProgramOptions& arguments, RmmResourceAdaptor* mr
+) {
+    rmm::mr::set_current_device_resource(mr);
+    rmm::mr::set_current_device_resource_ref(mr);
+    std::unordered_map memory_available{};
+    if (arguments.spill_device_limit.has_value()) {
+        auto limit_size =
+            rmm::available_device_memory().second
+            * static_cast(arguments.spill_device_limit.value() * 100);
+
+        memory_available[MemoryType::DEVICE] =
+            LimitAvailableMemory{mr, static_cast(limit_size)};
+    }
+    auto statistics = std::make_shared(mr);
+
+    auto br = std::make_shared(
+        mr,
+        std::move(memory_available),
+        arguments.periodic_spill,
+        std::make_shared(
+            arguments.num_streams, rmm::cuda_stream::flags::non_blocking
+        ),
+        statistics
+    );
+    auto environment = config::get_environment_variables();
+    environment["NUM_STREAMING_THREADS"] =
+        std::to_string(arguments.num_streaming_threads);
+    auto options = config::Options(environment);
+    std::shared_ptr comm;
+    switch (arguments.comm_type) {
+    case CommType::MPI:
+        RAPIDSMPF_EXPECTS(
+            !bootstrap::is_running_with_rrun(), "Can't use MPI communicator with rrun"
+        );
+        mpi::init(nullptr, nullptr);
+
+        comm = std::make_shared(MPI_COMM_WORLD, options);
+        break;
+    case CommType::SINGLE:
+        comm = std::make_shared(options);
+        break;
+    case CommType::UCXX:
+        if (bootstrap::is_running_with_rrun()) {
+            comm = bootstrap::create_ucxx_comm(bootstrap::Backend::AUTO, options);
+        } else {
+            mpi::init(nullptr, nullptr);
+            comm = ucxx::init_using_mpi(MPI_COMM_WORLD, options);
+        }
+        break;
+    default:
+        RAPIDSMPF_EXPECTS(false, "Unknown communicator type");
+    }
+    auto ctx = std::make_shared(options, comm, br, statistics);
+
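`create_context` turns the optional `--spill-device-limit` fraction into a byte budget for `LimitAvailableMemory`. A stand-alone helper making that conversion explicit (illustrative only; the patch derives its budget from `rmm::available_device_memory().second`, and any scaling such as the `* 100` above must be matched by a corresponding divisor when the value is treated as a percentage):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Convert a fraction in [0, 1] of the device's memory into bytes.
    // `total_bytes` would come from rmm::available_device_memory().second.
    std::int64_t spill_budget_bytes(std::size_t total_bytes, double fraction) {
        // Clamp so a mistyped flag can never produce a budget larger than
        // the device itself; parse_arguments rejects values outside [0, 1].
        fraction = std::clamp(fraction, 0.0, 1.0);
        return static_cast<std::int64_t>(
            static_cast<double>(total_bytes) * fraction
        );
    }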
if (comm->rank() == 0) { + comm->logger().print( + "Execution context on ", + comm->nranks(), + " ranks has ", + ctx->executor()->thread_count(), + " threads" + ); + } + return ctx; +} + +ProgramOptions parse_arguments(int argc, char** argv) { + ProgramOptions options; + + static constexpr std::array(CommType::MAX)> + comm_names{"single", "mpi", "ucxx"}; + + auto print_usage = [&argv, &options]() { + std::cerr + << "Usage: " << argv[0] << " [options]\n" + << "Options:\n" + << " --num-streaming-threads Number of streaming threads (default: " + << options.num_streaming_threads << ")\n" + << " --num-iterations Number of iterations (default: " + << options.num_iterations << ")\n" + << " --num-streams Number of streams in stream pool " + "(default: " + << options.num_streams << ")\n" + << " --num-rows-per-chunk Number of rows per chunk (default: " + << options.num_rows_per_chunk << ")\n" + << " --spill-device-limit Fractional spill device limit as " + "fraction " + "of total device memory (default: " + << (options.spill_device_limit.has_value() + ? std::to_string(options.spill_device_limit.value()) + : "None") + << ")\n" + << " --periodic-spill Duration in milliseconds between periodic " + "spilling checks (default: " + << (options.periodic_spill.has_value() + ? std::to_string(options.periodic_spill.value().count()) + : "None") + << ")\n" + << " --comm-type Communicator type: single, mpi, ucxx " + "(default: " + << comm_names[static_cast(options.comm_type)] << ")\n" + << " --use-shuffle-join Use shuffle join (default: " + << (options.use_shuffle_join ? "true" : "false") << ")\n" + << " --output-file Output file path (required)\n" + << " --input-directory Input directory path (required)\n" + << " --help Show this help message\n"; + }; + + // NOLINTBEGIN(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays,modernize-use-designated-initializers) + static struct option long_options[] = { + {"num-streaming-threads", required_argument, nullptr, 1}, + {"num-rows-per-chunk", required_argument, nullptr, 2}, + {"use-shuffle-join", no_argument, nullptr, 3}, + {"output-file", required_argument, nullptr, 4}, + {"input-directory", required_argument, nullptr, 5}, + {"help", no_argument, nullptr, 6}, + {"spill-device-limit", required_argument, nullptr, 7}, + {"num-iterations", required_argument, nullptr, 8}, + {"num-streams", required_argument, nullptr, 9}, + {"comm-type", required_argument, nullptr, 10}, + {"periodic-spill", required_argument, nullptr, 11}, + {nullptr, 0, nullptr, 0} + }; + // NOLINTEND(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays,modernize-use-designated-initializers) + + int opt; + int option_index = 0; + + bool saw_output_file = false; + bool saw_input_directory = false; + + while ((opt = getopt_long(argc, argv, "", long_options, &option_index)) != -1) { + switch (opt) { + case 1: + { + char* endptr; + long val = std::strtol(optarg, &endptr, 10); + if (*endptr != '\0' || val <= 0) { + std::cerr << "Error: Invalid value for --num-streaming-threads: " + << optarg << "\n\n"; + print_usage(); + std::exit(1); + } + options.num_streaming_threads = static_cast(val); + break; + } + case 2: + { + char* endptr; + long val = std::strtol(optarg, &endptr, 10); + if (*endptr != '\0' || val <= 0) { + std::cerr << "Error: Invalid value for --num-rows-per-chunk: " + << optarg << "\n\n"; + print_usage(); + std::exit(1); + } + options.num_rows_per_chunk = static_cast(val); + break; + } + case 3: + options.use_shuffle_join = true; + break; + case 4: + options.output_file = optarg; + 
saw_output_file = true; + break; + case 5: + options.input_directory = optarg; + saw_input_directory = true; + break; + case 6: + print_usage(); + std::exit(0); + case 7: + { + char* endptr; + double val = std::strtod(optarg, &endptr); + if (*endptr != '\0' || val < 0.0 || val > 1.0) { + std::cerr << "Error: Invalid value for --spill-device-limit: " + << optarg << " (must be between 0.0 and 1.0)\n\n"; + print_usage(); + std::exit(1); + } + options.spill_device_limit = val; + break; + } + case 8: + { + char* endptr; + long val = std::strtol(optarg, &endptr, 10); + if (*endptr != '\0' || val <= 0) { + std::cerr << "Error: Invalid value for --num-iterations: " << optarg + << "\n\n"; + print_usage(); + std::exit(1); + } + options.num_iterations = static_cast(val); + break; + } + case 9: + { + char* endptr; + long val = std::strtol(optarg, &endptr, 10); + if (*endptr != '\0' || val <= 0) { + std::cerr << "Error: Invalid value for --num-streams: " << optarg + << "\n\n"; + print_usage(); + std::exit(1); + } + options.num_streams = static_cast(val); + break; + } + case 10: + { + std::string comm_type = optarg; + if (comm_type == "mpi") { + options.comm_type = CommType::MPI; + } else if (comm_type == "single") { + options.comm_type = CommType::SINGLE; + } else if (comm_type == "ucxx") { + options.comm_type = CommType::UCXX; + } else { + std::cerr << "Error: Invalid value for --comm-type: " << optarg + << " (must be one of " << comm_names[0]; + for (std::size_t i = 1; i < comm_names.size(); ++i) { + std::cerr << ", " << comm_names[i]; + } + std::cerr << ")\n\n"; + print_usage(); + std::exit(1); + } + break; + } + case 11: + { + char* endptr; + long val = std::strtol(optarg, &endptr, 10); + if (*endptr != '\0' || val <= 0) { + std::cerr << "Error: Invalid value for --periodic-spill: " << optarg + << "\n\n"; + print_usage(); + std::exit(1); + } + options.periodic_spill = std::chrono::milliseconds(val); + break; + } + case '?': + if (optopt == 0 && optind > 1) { + std::cerr << "Error: Unknown option '" << argv[optind - 1] << "'\n\n"; + } + print_usage(); + std::exit(1); + default: + print_usage(); + std::exit(1); + } + } + + // Check if required options were provided + if (!saw_output_file || !saw_input_directory) { + if (!saw_output_file) { + std::cerr << "Error: --output-file is required\n"; + } + if (!saw_input_directory) { + std::cerr << "Error: --input-directory is required\n"; + } + std::cerr << std::endl; + print_usage(); + std::exit(1); + } + + return options; +} } // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/utils.hpp b/cpp/benchmarks/streaming/ndsh/utils.hpp index 9588a33b1..4172735c4 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.hpp +++ b/cpp/benchmarks/streaming/ndsh/utils.hpp @@ -4,6 +4,7 @@ */ #pragma once +#include #include #include #include @@ -13,12 +14,90 @@ namespace rapidsmpf::ndsh { namespace detail { + +/** + * @brief List all parquet files in a given path. + * + * @param root_path The path to look in. + * + * @return If `root_path` names a regular file that ends with `.parquet` then a singleton + * vector of just that file. If `root_path` is a directory, then a vector containing all + * regular files in that directory whose name ends with `.parquet`, in the order they are + * listed. + * + * @throws std::runtime_error if the `root_path` doesn't name a regular file or a + * directory. Or if it does name a regular file, but that file doesn't end in `.parquet`. 
+ */
 [[nodiscard]] std::vector list_parquet_files(std::string const& root_path);
-}
+}  // namespace detail
 
+/**
+ * @brief Ensure a `TableChunk` is on device.
+ *
+ * @param ctx Streaming context
+ * @param chunk Chunk to move to device; it is left in a moved-from state
+ * @param allow_overbooking Whether reserving memory is allowed to overbook
+ *
+ * @return New `TableChunk` on device
+ * @throws std::overflow_error if overbooking is not allowed and not enough memory is
+ * available to reserve.
+ */
 [[nodiscard]] streaming::TableChunk to_device(
     std::shared_ptr ctx,
     streaming::TableChunk&& chunk,
     bool allow_overbooking = false
 );
+
+/// @brief Communicator type to use
+enum class CommType : std::uint8_t {
+    SINGLE,  ///< Single process communicator
+    MPI,  ///< MPI backed communicator
+    UCXX,  ///< UCXX backed communicator
+    MAX,  ///< Sentinel: number of communicator types
+};
+
+/// @brief Configuration options for the query
+struct ProgramOptions {
+    int num_streaming_threads{1};  ///< Number of streaming threads to use
+    int num_iterations{2};  ///< Number of iterations of query to run
+    int num_streams{16};  ///< Number of streams in stream pool
+    CommType comm_type{CommType::UCXX};  ///< Type of communicator to create
+    std::optional
+        periodic_spill;  ///< Duration between background periodic spilling checks
+    cudf::size_type num_rows_per_chunk{
+        100'000'000
+    };  ///< Number of rows to produce per chunk read
+    std::optional spill_device_limit{
+        std::nullopt
+    };  ///< Optional fractional spill limit
+    bool use_shuffle_join = false;  ///< Use shuffle join for "big" joins?
+    std::string output_file;  ///< File to write output to
+    std::string input_directory;  ///< Directory containing input files
+};
+
+/**
+ * @brief Parse command-line arguments.
+ *
+ * @param argc Number of arguments
+ * @param argv Arguments
+ *
+ * @return `ProgramOptions` struct with parsed arguments.
+ */
+ProgramOptions parse_arguments(int argc, char** argv);
+
+/**
+ * @brief Create a streaming execution context for a query.
+ *
+ * @param arguments Arguments to configure the context
+ * @param mr Pointer to memory resource to use for all allocations
+ * @warning The memory resource _must_ be kept alive until the final usage of the returned
+ * Context is complete.
+ *
+ * @return Shared pointer to new streaming context.
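With `parse_arguments` and `create_context` in one place, every query driver collapses to the same few lines. A sketch of the intended call pattern, mirroring the refactored `main` functions later in this series (the pipeline body is elided):

    int main(int argc, char** argv) {
        cudaFree(nullptr);  // force CUDA context creation up front
        auto mr = rmm::mr::cuda_async_memory_resource{};
        // The adaptor must outlive every use of the returned context.
        auto stats_wrapper = rapidsmpf::RmmResourceAdaptor(&mr);
        auto arguments = rapidsmpf::ndsh::parse_arguments(argc, argv);
        auto ctx = rapidsmpf::ndsh::create_context(arguments, &stats_wrapper);
        // ... build channels and nodes, then run_streaming_pipeline(...) ...
        if (rapidsmpf::mpi::is_initialized()) {
            RAPIDSMPF_MPI(MPI_Finalize());
        }
        return 0;
    }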
+ */ +std::shared_ptr create_context( + ProgramOptions& arguments, RmmResourceAdaptor* mr +); + + } // namespace rapidsmpf::ndsh From e24cfc3fcbfaeeae7e5f55d4a07e8a948528bad9 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Dec 2025 16:54:51 +0000 Subject: [PATCH 06/75] Use refactored context/argparse in q03 --- cpp/benchmarks/streaming/ndsh/q03.cpp | 522 ++++++++------------------ 1 file changed, 159 insertions(+), 363 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index 8059f3c30..37671f309 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -8,53 +8,36 @@ #include #include #include -#include #include #include #include -#include #include #include #include -#include -#include #include -#include #include #include #include #include -#include #include -#include #include -#include -#include #include #include #include #include -#include #include #include -#include -#include -#include -#include -#include #include #include -#include -#include -#include -#include #include +#include #include +#include #include -#include +#include #include #include #include @@ -64,33 +47,7 @@ #include "concatenate.hpp" #include "join.hpp" -#include "rapidsmpf/cuda_stream.hpp" -#include "rapidsmpf/owning_wrapper.hpp" -#include "utilities.hpp" - -// select -// l_orderkey, -// sum(l_extendedprice * (1 - l_discount)) as revenue, -// o_orderdate, -// o_shippriority -// from -// customer, -// orders, -// lineitem -// where -// c_mktsegment = 'BUILDING' -// and c_custkey = o_custkey -// and l_orderkey = o_orderkey -// and o_orderdate < '1995-03-15' -// and l_shipdate > '1995-03-15' -// group by -// l_orderkey, -// o_orderdate, -// o_shippriority -// order by -// revenue desc, -// o_orderdate -// limit 10 +#include "utils.hpp" namespace { @@ -395,53 +352,42 @@ rapidsmpf::streaming::Node read_orders( auto packed_data = co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); if (ctx->comm()->rank() == 0) { - std::vector chunks; - chunks.reserve(packed_data.size()); - std::ranges::transform( - packed_data, std::back_inserter(chunks), [](auto& chunk) { - return std::move(chunk.data); - } - ); auto global_result = rapidsmpf::unpack_and_concat( rapidsmpf::unspill_partitions( - std::move(chunks), ctx->br(), true, ctx->statistics() + std::move(packed_data), ctx->br(), true, ctx->statistics() ), chunk_stream, ctx->br(), ctx->statistics() ); - if (ctx->comm()->rank() == 0) { - // We will only actually bother to do this on rank zero. - auto result_view = global_result->view(); - auto grouper = cudf::groupby::groupby( - result_view.select({0, 1, 2}), - cudf::null_policy::EXCLUDE, - cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request( - result_view.column(3), std::move(aggs) - ) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - global_result.reset(); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); + // We will only actually bother to do this on rank zero. 
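The rank-0 step that follows is the second half of a two-phase aggregation: each chunk was partially grouped upstream, the partials were allgathered and concatenated, and grouping the concatenation again by the same keys gives exact totals, because sums of sums commute (an average would instead need sum and count carried separately). In isolation, with stream and memory-resource plumbing elided, that final step has the shape below (a sketch under the column layout used here, keys in columns 0..2 and the partial sum in column 3; `finalize_partials` is an illustrative name, not this file's code):

    #include <algorithm>
    #include <iterator>
    #include <memory>
    #include <vector>
    #include <cudf/aggregation.hpp>
    #include <cudf/groupby.hpp>
    #include <cudf/table/table.hpp>
    #include <cudf/table/table_view.hpp>

    std::unique_ptr<cudf::table> finalize_partials(cudf::table_view partials) {
        // Re-group the concatenated partial results by the same keys.
        cudf::groupby::groupby grouper(
            partials.select({0, 1, 2}), cudf::null_policy::EXCLUDE, cudf::sorted::NO
        );
        std::vector<cudf::groupby::aggregation_request> requests;
        std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggs;
        aggs.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
        requests.push_back({partials.column(3), std::move(aggs)});
        auto [keys, results] = grouper.aggregate(requests);
        // Stitch keys and aggregated values back into one table.
        auto columns = keys->release();
        for (auto&& r : results) {
            std::ranges::move(r.results, std::back_inserter(columns));
        }
        return std::make_unique<cudf::table>(std::move(columns));
    }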
+ auto result_view = global_result->view(); + auto grouper = cudf::groupby::groupby( + result_view.select({0, 1, 2}), + cudf::null_policy::EXCLUDE, + cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + cudf::groupby::aggregation_request(result_view.column(3), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + global_result.reset(); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); } + co_await ch_out->send( + rapidsmpf::streaming::to_message( + 0, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); } else { std::ignore = std::move(packed_data); } @@ -661,297 +607,147 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl } } // namespace -struct ProgramOptions { - int num_streaming_threads{1}; - int num_iterations{2}; - cudf::size_type num_rows_per_chunk{100'000'000}; - std::optional spill_device_limit{std::nullopt}; - bool use_shuffle_join = false; - std::string output_file; - std::string input_directory; -}; - -ProgramOptions parse_options(int argc, char** argv) { - ProgramOptions options; - - auto print_usage = [&argv]() { - std::cerr - << "Usage: " << argv[0] << " [options]\n" - << "Options:\n" - << " --num-streaming-threads Number of streaming threads (default: 1)\n" - << " --num-iterations Number of iterations (default: 2)\n" - << " --num-rows-per-chunk Number of rows per chunk (default: " - "100000000)\n" - << " --spill-device-limit Fractional spill device limit (default: " - "None)\n" - << " --use-shuffle-join Use shuffle join (default: false)\n" - << " --output-file Output file path (required)\n" - << " --input-directory Input directory path (required)\n" - << " --help Show this help message\n"; - }; - - static struct option long_options[] = { - {"num-streaming-threads", required_argument, nullptr, 1}, - {"num-rows-per-chunk", required_argument, nullptr, 2}, - {"use-shuffle-join", no_argument, nullptr, 3}, - {"output-file", required_argument, nullptr, 4}, - {"input-directory", required_argument, nullptr, 5}, - {"help", no_argument, nullptr, 6}, - {"spill-device-limit", required_argument, nullptr, 7}, - {"num-iterations", required_argument, nullptr, 8}, - {nullptr, 0, nullptr, 0} - }; - - int opt; - int option_index = 0; - - bool saw_output_file = false; - bool saw_input_directory = false; - - while ((opt = getopt_long(argc, argv, "", long_options, &option_index)) != -1) { - switch (opt) { - case 1: - options.num_streaming_threads = std::atoi(optarg); - break; - case 2: - options.num_rows_per_chunk = std::atoi(optarg); - break; - case 3: - options.use_shuffle_join = true; - break; - case 4: - options.output_file = optarg; - saw_output_file = true; - break; - case 5: - options.input_directory = optarg; - saw_input_directory = true; - break; - case 6: - print_usage(); - std::exit(0); - case 7: - options.spill_device_limit = std::stod(optarg); - break; - case 8: - options.num_iterations = std::atoi(optarg); - break; - case '?': - if (optopt == 0 && optind > 1) { - std::cerr << "Error: Unknown option '" << argv[optind - 1] << "'\n\n"; - } - print_usage(); - std::exit(1); - default: - print_usage(); - std::exit(1); - } - } - - // Check if required options were provided - if (!saw_output_file || !saw_input_directory) { - if (!saw_output_file) { - std::cerr << "Error: --output-file is required\n"; 
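The `getopt_long` loop that replaces this block (patch 05's `parse_arguments`) validates every numeric flag with `strtol`/`strtod` rather than `std::atoi`, which returns 0 on malformed input and therefore cannot tell `--num-iterations 0` from `--num-iterations x`. The check, extracted into a stand-alone helper (illustrative, not part of the patch):

    #include <cstdlib>
    #include <optional>

    // Parse a strictly positive base-10 integer, rejecting empty input,
    // trailing junk ("10abc"), and non-positive values.
    std::optional<long> parse_positive_long(char const* arg) {
        char* endptr = nullptr;
        long val = std::strtol(arg, &endptr, 10);
        if (endptr == arg || *endptr != '\0' || val <= 0) {
            return std::nullopt;
        }
        return val;
    }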
- } - if (!saw_input_directory) { - std::cerr << "Error: --input-directory is required\n"; - } - std::cerr << std::endl; - print_usage(); - std::exit(1); - } - - return options; -} - int main(int argc, char** argv) { cudaFree(nullptr); - rapidsmpf::mpi::init(&argc, &argv); - MPI_Comm mpi_comm; - RAPIDSMPF_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); - auto cmd_options = parse_options(argc, argv); - auto limit_size = rmm::percent_of_free_device_memory( - static_cast(cmd_options.spill_device_limit.value_or(1) * 100) - ); - rmm::mr::cuda_async_memory_resource mr{}; - // rmm::mr::cuda_memory_resource base{}; - // rmm::mr::pool_memory_resource mr{&base, pool_size}; - auto stats_mr = rapidsmpf::RmmResourceAdaptor(&mr); - rmm::device_async_resource_ref mr_ref(stats_mr); - rmm::mr::set_current_device_resource(&stats_mr); - rmm::mr::set_current_device_resource_ref(mr_ref); - std::unordered_map - memory_available{}; - if (cmd_options.spill_device_limit.has_value()) { - memory_available[rapidsmpf::MemoryType::DEVICE] = rapidsmpf::LimitAvailableMemory{ - &stats_mr, static_cast(limit_size) - }; - } - auto br = std::make_shared( - stats_mr, std::move(memory_available) - ); - auto envvars = rapidsmpf::config::get_environment_variables(); - envvars["num_streaming_threads"] = std::to_string(cmd_options.num_streaming_threads); - auto options = rapidsmpf::config::Options(envvars); - auto stats = std::make_shared(&stats_mr); - { - auto comm = rapidsmpf::ucxx::init_using_mpi(mpi_comm, options); - auto progress = - std::make_shared(comm->logger(), stats); - auto ctx = - std::make_shared(options, comm, br, stats); - comm->logger().print( - "Executor has ", ctx->executor()->thread_count(), " threads" - ); - comm->logger().print("Executor has ", ctx->comm()->nranks(), " ranks"); - - std::string output_path = cmd_options.output_file; - std::vector timings; - for (int i = 0; i < cmd_options.num_iterations; i++) { - int op_id{0}; - std::vector nodes; - auto start = std::chrono::steady_clock::now(); - { - RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q3 pipeline"); - - // Input data channels - auto customer = ctx->create_channel(); - auto lineitem = ctx->create_channel(); - auto orders = ctx->create_channel(); - - // join channels - auto customer_x_orders = ctx->create_channel(); - auto customer_x_orders_x_lineitem = ctx->create_channel(); - - // Out: "c_custkey" - nodes.push_back(read_customer( + auto mr = rmm::mr::cuda_async_memory_resource{}; + auto stats_wrapper = rapidsmpf::RmmResourceAdaptor(&mr); + auto arguments = rapidsmpf::ndsh::parse_arguments(argc, argv); + auto ctx = rapidsmpf::ndsh::create_context(arguments, &stats_wrapper); + std::string output_path = arguments.output_file; + std::vector timings; + for (int i = 0; i < arguments.num_iterations; i++) { + int op_id{0}; + std::vector nodes; + auto start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q3 pipeline"); + auto customer = ctx->create_channel(); + auto lineitem = ctx->create_channel(); + auto orders = ctx->create_channel(); + + auto customer_x_orders = ctx->create_channel(); + auto customer_x_orders_x_lineitem = ctx->create_channel(); + + // Out: "c_custkey" + nodes.push_back(read_customer( + ctx, + customer, + /* num_tickets */ 2, + arguments.num_rows_per_chunk, + arguments.input_directory + )); + // Out: o_orderkey, o_orderdate, o_shippriority, o_custkey + nodes.push_back(read_orders( + ctx, orders, 6, arguments.num_rows_per_chunk, arguments.input_directory + )); + // join c_custkey = o_custkey + // Out: 
o_orderkey, o_orderdate, o_shippriority + nodes.push_back( + rapidsmpf::ndsh::inner_join_broadcast( ctx, customer, - /* num_tickets */ 2, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); - // Out: o_orderkey, o_orderdate, o_shippriority, o_custkey - nodes.push_back(read_orders( - ctx, orders, - 6, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); - // join c_custkey = o_custkey - // Out: o_orderkey, o_orderdate, o_shippriority - nodes.push_back( - rapidsmpf::ndsh::inner_join_broadcast( - ctx, - customer, - orders, - customer_x_orders, - {0}, - {3}, - static_cast(10 * i + op_id++), - rapidsmpf::ndsh::KeepKeys::NO - ) - ); - // Out: l_orderkey, l_extendedprice, l_discount - nodes.push_back(read_lineitem( + customer_x_orders, + {0}, + {3}, + static_cast(10 * i + op_id++), + rapidsmpf::ndsh::KeepKeys::NO + ) + ); + // Out: l_orderkey, l_extendedprice, l_discount + nodes.push_back(read_lineitem( + ctx, + lineitem, + /* num_tickets */ 6, + arguments.num_rows_per_chunk, + arguments.input_directory + )); + + // join o_orderkey = l_orderkey + // Out: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, + // l_discount + nodes.push_back( + rapidsmpf::ndsh::inner_join_broadcast( ctx, + customer_x_orders, lineitem, - /* num_tickets */ 6, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); - - // join o_orderkey = l_orderkey - // Out: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, - // l_discount - nodes.push_back( - rapidsmpf::ndsh::inner_join_broadcast( - ctx, - customer_x_orders, - lineitem, - customer_x_orders_x_lineitem, - {0}, - {0}, - static_cast(10 * i + op_id++), - rapidsmpf::ndsh::KeepKeys::YES - ) - ); + customer_x_orders_x_lineitem, + {0}, + {0}, + static_cast(10 * i + op_id++), + rapidsmpf::ndsh::KeepKeys::YES + ) + ); - auto groupby_input = ctx->create_channel(); - // Out: o_orderkey, o_orderdate, o_shippriority, revenue - nodes.push_back(select_columns_for_groupby( - ctx, customer_x_orders_x_lineitem, groupby_input - )); - auto chunkwise_groupby_output = ctx->create_channel(); - // Out: o_orderkey, o_orderdate, o_shippriority, revenue - nodes.push_back( - chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby_output) - ); - auto concatenated_groupby_output = ctx->create_channel(); - nodes.push_back( - rapidsmpf::ndsh::concatenate( - ctx, - chunkwise_groupby_output, - concatenated_groupby_output, - rapidsmpf::ndsh::ConcatOrder::DONT_CARE - ) - ); - auto groupby_output = ctx->create_channel(); - // Out: o_orderkey, o_orderdate, o_shippriority, revenue - nodes.push_back(final_groupby_agg( + auto groupby_input = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back(select_columns_for_groupby( + ctx, customer_x_orders_x_lineitem, groupby_input + )); + auto chunkwise_groupby_output = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back( + chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby_output) + ); + auto concatenated_groupby_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::concatenate( ctx, + chunkwise_groupby_output, concatenated_groupby_output, - groupby_output, - static_cast(10 * i + op_id++) - )); - auto topk = ctx->create_channel(); - // Out: o_orderkey, o_orderdate, o_shippriority, revenue - nodes.push_back(top_k( - ctx, - groupby_output, - topk, - {3, 1}, - {cudf::order::DESCENDING, cudf::order::ASCENDING}, - 10 - )); - - nodes.push_back(write_parquet(ctx, topk, output_path)); - } - 
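Each collective operator in the pipeline above consumes one `OpID`, and the pipeline is rebuilt every iteration, so the drivers tag operators with `10 * i + op_id++`: iteration `i` owns the ID block `[10 * i, 10 * i + 10)` and can never collide with a tag from an earlier run. Made explicit as a hypothetical helper (it assumes at most ten collectives per pipeline, which these queries satisfy, and that `OpID` is convertible from a small integer, as the casts in this file suggest):

    #include <cassert>

    constexpr int kMaxOpsPerIteration = 10;

    // Hand out operation IDs in per-iteration blocks; op_id starts at 0
    // each iteration, exactly like the op_id counter in main() above.
    rapidsmpf::OpID make_op_id(int iteration, int& op_id) {
        assert(op_id < kMaxOpsPerIteration);
        return static_cast<rapidsmpf::OpID>(
            kMaxOpsPerIteration * iteration + op_id++
        );
    }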
auto end = std::chrono::steady_clock::now(); - std::chrono::duration pipeline = end - start; - start = std::chrono::steady_clock::now(); - { - RAPIDSMPF_NVTX_SCOPED_RANGE("Q3 Iteration"); - rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); - } - end = std::chrono::steady_clock::now(); - std::chrono::duration compute = end - start; - comm->logger().print( - "Iteration ", i, " pipeline construction time [s]: ", pipeline.count() + rapidsmpf::ndsh::ConcatOrder::DONT_CARE + ) ); - comm->logger().print("Iteration ", i, " compute time [s]: ", compute.count()); - timings.push_back(pipeline.count()); - timings.push_back(compute.count()); - ctx->comm()->logger().print(stats->report()); - RAPIDSMPF_MPI(MPI_Barrier(mpi_comm)); + auto groupby_output = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back(final_groupby_agg( + ctx, + concatenated_groupby_output, + groupby_output, + static_cast(10 * i + op_id++) + )); + auto topk = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back(top_k( + ctx, + groupby_output, + topk, + {3, 1}, + {cudf::order::DESCENDING, cudf::order::ASCENDING}, + 10 + )); + + nodes.push_back(write_parquet(ctx, topk, output_path)); } - if (comm->rank() == 0) { - for (int i = 0; i < cmd_options.num_iterations; i++) { - comm->logger().print( - "Iteration ", - i, - " pipeline construction time [s]: ", - timings[size_t(2 * i)] - ); - comm->logger().print( - "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] - ); - } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration pipeline = end - start; + start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Q3 Iteration"); + rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration compute = end - start; + timings.push_back(pipeline.count()); + timings.push_back(compute.count()); + ctx->comm()->logger().print(ctx->statistics()->report()); + } + if (ctx->comm()->rank() == 0) { + for (int i = 0; i < arguments.num_iterations; i++) { + ctx->comm()->logger().print( + "Iteration ", + i, + " pipeline construction time [s]: ", + timings[size_t(2 * i)] + ); + ctx->comm()->logger().print( + "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] + ); } } - RAPIDSMPF_MPI(MPI_Comm_free(&mpi_comm)); - RAPIDSMPF_MPI(MPI_Finalize()); + if (rapidsmpf::mpi::is_initialized()) { + RAPIDSMPF_MPI(MPI_Finalize()); + } return 0; } From 50a050a7ae01d680e7e66a039f12938fbf9dd7be Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Dec 2025 17:34:29 +0000 Subject: [PATCH 07/75] And in q1 --- cpp/benchmarks/streaming/ndsh/q01.cpp | 333 ++++++-------------------- 1 file changed, 76 insertions(+), 257 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index e443a3cbe..a0f058177 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -8,53 +8,31 @@ #include #include #include -#include #include #include #include -#include #include #include #include #include -#include -#include -#include #include #include #include -#include -#include #include -#include #include -#include -#include #include -#include #include #include -#include #include #include -#include -#include -#include -#include -#include #include #include -#include -#include -#include -#include -#include #include +#include #include -#include #include #include #include @@ 
-63,11 +41,7 @@ #include #include "concatenate.hpp" -#include "join.hpp" -#include "rapidsmpf/cuda_stream.hpp" -#include "rapidsmpf/owning_wrapper.hpp" -#include "rapidsmpf/streaming/core/coro_utils.hpp" -#include "utilities.hpp" +#include "utils.hpp" // select // l_orderkey, @@ -351,16 +325,9 @@ std::string get_table_path( auto packed_data = co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); if (ctx->comm()->rank() == 0) { - std::vector chunks; - chunks.reserve(packed_data.size()); - std::ranges::transform( - packed_data, std::back_inserter(chunks), [](auto& chunk) { - return std::move(chunk.data); - } - ); auto global_result = rapidsmpf::unpack_and_concat( rapidsmpf::unspill_partitions( - std::move(chunks), ctx->br(), true, ctx->statistics() + std::move(packed_data), ctx->br(), true, ctx->statistics() ), chunk_stream, ctx->br(), @@ -678,232 +645,84 @@ static __device__ void calculate_charge(double *charge, double discprice, double } } // namespace -struct ProgramOptions { - int num_streaming_threads{1}; - int num_iterations{2}; - cudf::size_type num_rows_per_chunk{100'000'000}; - std::optional spill_device_limit{std::nullopt}; - bool use_shuffle_join = false; - std::string output_file; - std::string input_directory; -}; - -ProgramOptions parse_options(int argc, char** argv) { - ProgramOptions options; - - auto print_usage = [&argv]() { - std::cerr - << "Usage: " << argv[0] << " [options]\n" - << "Options:\n" - << " --num-streaming-threads Number of streaming threads (default: 1)\n" - << " --num-iterations Number of iterations (default: 2)\n" - << " --num-rows-per-chunk Number of rows per chunk (default: " - "100000000)\n" - << " --spill-device-limit Fractional spill device limit (default: " - "None)\n" - << " --use-shuffle-join Use shuffle join (default: false)\n" - << " --output-file Output file path (required)\n" - << " --input-directory Input directory path (required)\n" - << " --help Show this help message\n"; - }; - - static struct option long_options[] = { - {"num-streaming-threads", required_argument, nullptr, 1}, - {"num-rows-per-chunk", required_argument, nullptr, 2}, - {"use-shuffle-join", no_argument, nullptr, 3}, - {"output-file", required_argument, nullptr, 4}, - {"input-directory", required_argument, nullptr, 5}, - {"help", no_argument, nullptr, 6}, - {"spill-device-limit", required_argument, nullptr, 7}, - {"num-iterations", required_argument, nullptr, 8}, - {nullptr, 0, nullptr, 0} - }; - - int opt; - int option_index = 0; - - bool saw_output_file = false; - bool saw_input_directory = false; - - while ((opt = getopt_long(argc, argv, "", long_options, &option_index)) != -1) { - switch (opt) { - case 1: - options.num_streaming_threads = std::atoi(optarg); - break; - case 2: - options.num_rows_per_chunk = std::atoi(optarg); - break; - case 3: - options.use_shuffle_join = true; - break; - case 4: - options.output_file = optarg; - saw_output_file = true; - break; - case 5: - options.input_directory = optarg; - saw_input_directory = true; - break; - case 6: - print_usage(); - std::exit(0); - case 7: - options.spill_device_limit = std::stod(optarg); - break; - case 8: - options.num_iterations = std::atoi(optarg); - break; - case '?': - if (optopt == 0 && optind > 1) { - std::cerr << "Error: Unknown option '" << argv[optind - 1] << "'\n\n"; - } - print_usage(); - std::exit(1); - default: - print_usage(); - std::exit(1); - } - } - - // Check if required options were provided - if (!saw_output_file || !saw_input_directory) { - if (!saw_output_file) { - 
std::cerr << "Error: --output-file is required\n"; - } - if (!saw_input_directory) { - std::cerr << "Error: --input-directory is required\n"; - } - std::cerr << std::endl; - print_usage(); - std::exit(1); - } - - return options; -} - int main(int argc, char** argv) { cudaFree(nullptr); - rapidsmpf::mpi::init(&argc, &argv); - MPI_Comm mpi_comm; - RAPIDSMPF_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); - auto cmd_options = parse_options(argc, argv); - auto limit_size = rmm::percent_of_free_device_memory( - static_cast(cmd_options.spill_device_limit.value_or(1) * 100) - ); - rmm::mr::cuda_async_memory_resource mr{}; - // rmm::mr::cuda_memory_resource base{}; - // rmm::mr::pool_memory_resource mr{&base, pool_size}; - auto stats_mr = rapidsmpf::RmmResourceAdaptor(&mr); - rmm::device_async_resource_ref mr_ref(stats_mr); - rmm::mr::set_current_device_resource(&stats_mr); - rmm::mr::set_current_device_resource_ref(mr_ref); - std::unordered_map - memory_available{}; - if (cmd_options.spill_device_limit.has_value()) { - memory_available[rapidsmpf::MemoryType::DEVICE] = rapidsmpf::LimitAvailableMemory{ - &stats_mr, static_cast(limit_size) - }; - } - auto br = std::make_shared( - stats_mr, std::move(memory_available) - ); - auto envvars = rapidsmpf::config::get_environment_variables(); - envvars["num_streaming_threads"] = std::to_string(cmd_options.num_streaming_threads); - auto options = rapidsmpf::config::Options(envvars); - auto stats = std::make_shared(&stats_mr); - { - auto comm = rapidsmpf::ucxx::init_using_mpi(mpi_comm, options); - auto progress = - std::make_shared(comm->logger(), stats); - auto ctx = - std::make_shared(options, comm, br, stats); - comm->logger().print( - "Executor has ", ctx->executor()->thread_count(), " threads" - ); - comm->logger().print("Executor has ", ctx->comm()->nranks(), " ranks"); - - std::string output_path = cmd_options.output_file; - std::vector timings; - [[maybe_unused]] int op_id = 0; - for (int i = 0; i < cmd_options.num_iterations; i++) { - std::vector nodes; - auto start = std::chrono::steady_clock::now(); - { - RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q1 pipeline"); - - // Input data channels - auto lineitem = ctx->create_channel(); - // Out: l_returnflag, l_linestatus, l_quantity, l_extendedprice, - // l_discount, l_tax - nodes.push_back(read_lineitem( - ctx, - lineitem, - /* num_tickets */ 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); - - auto groupby_input = ctx->create_channel(); - // Out: l_returnflag, l_linestatus, l_quantity, l_extendedprice, - // disc_price = (l_extendedprice * (1 - l_discount)), - // charge = (l_extendedprice * (1 - l_discount) * (1 + l_tax)) - // l_discount - nodes.push_back(select_columns_for_groupby(ctx, lineitem, groupby_input)); - auto chunkwise_groupby = ctx->create_channel(); - nodes.push_back( - chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby) - ); - auto final_groupby_input = ctx->create_channel(); - nodes.push_back( - rapidsmpf::ndsh::concatenate( - ctx, chunkwise_groupby, final_groupby_input - ) - ); - auto groupby_output = ctx->create_channel(); - nodes.push_back(final_groupby_agg( - ctx, - final_groupby_input, - groupby_output, - static_cast(10 * i + op_id++) - )); - auto sorted = ctx->create_channel(); - nodes.push_back(sort_by(ctx, groupby_output, sorted)); - nodes.push_back(write_parquet(ctx, sorted, output_path)); - } - auto end = std::chrono::steady_clock::now(); - std::chrono::duration pipeline = end - start; - start = std::chrono::steady_clock::now(); - { - 
RAPIDSMPF_NVTX_SCOPED_RANGE("Q3 Iteration"); - rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); - } - end = std::chrono::steady_clock::now(); - std::chrono::duration compute = end - start; - comm->logger().print( - "Iteration ", i, " pipeline construction time [s]: ", pipeline.count() + auto mr = rmm::mr::cuda_async_memory_resource{}; + auto stats_wrapper = rapidsmpf::RmmResourceAdaptor(&mr); + auto arguments = rapidsmpf::ndsh::parse_arguments(argc, argv); + auto ctx = rapidsmpf::ndsh::create_context(arguments, &stats_wrapper); + std::string output_path = arguments.output_file; + std::vector timings; + for (int i = 0; i < arguments.num_iterations; i++) { + int op_id = 0; + std::vector nodes; + auto start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q1 pipeline"); + + // Input data channels + auto lineitem = ctx->create_channel(); + // Out: l_returnflag, l_linestatus, l_quantity, l_extendedprice, + // l_discount, l_tax + nodes.push_back(read_lineitem( + ctx, + lineitem, + /* num_tickets */ 4, + arguments.num_rows_per_chunk, + arguments.input_directory + )); + + auto groupby_input = ctx->create_channel(); + // Out: l_returnflag, l_linestatus, l_quantity, l_extendedprice, + // disc_price = (l_extendedprice * (1 - l_discount)), + // charge = (l_extendedprice * (1 - l_discount) * (1 + l_tax)) + // l_discount + nodes.push_back(select_columns_for_groupby(ctx, lineitem, groupby_input)); + auto chunkwise_groupby = ctx->create_channel(); + nodes.push_back(chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby)); + auto final_groupby_input = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::concatenate(ctx, chunkwise_groupby, final_groupby_input) ); - comm->logger().print("Iteration ", i, " compute time [s]: ", compute.count()); - timings.push_back(pipeline.count()); - timings.push_back(compute.count()); - ctx->comm()->logger().print(stats->report()); - RAPIDSMPF_MPI(MPI_Barrier(mpi_comm)); + auto groupby_output = ctx->create_channel(); + nodes.push_back(final_groupby_agg( + ctx, + final_groupby_input, + groupby_output, + static_cast(10 * i + op_id++) + )); + auto sorted = ctx->create_channel(); + nodes.push_back(sort_by(ctx, groupby_output, sorted)); + nodes.push_back(write_parquet(ctx, sorted, output_path)); } - if (comm->rank() == 0) { - for (int i = 0; i < cmd_options.num_iterations; i++) { - comm->logger().print( - "Iteration ", - i, - " pipeline construction time [s]: ", - timings[size_t(2 * i)] - ); - comm->logger().print( - "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] - ); - } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration pipeline = end - start; + start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Q1 Iteration"); + rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); } + end = std::chrono::steady_clock::now(); + std::chrono::duration compute = end - start; + timings.push_back(pipeline.count()); + timings.push_back(compute.count()); + ctx->comm()->logger().print(ctx->statistics()->report()); + } + if (ctx->comm()->rank() == 0) { + for (int i = 0; i < arguments.num_iterations; i++) { + ctx->comm()->logger().print( + "Iteration ", + i, + " pipeline construction time [s]: ", + timings[size_t(2 * i)] + ); + ctx->comm()->logger().print( + "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] + ); + } + } + if (rapidsmpf::mpi::is_initialized()) { + RAPIDSMPF_MPI(MPI_Finalize()); } - - RAPIDSMPF_MPI(MPI_Comm_free(&mpi_comm)); - 
RAPIDSMPF_MPI(MPI_Finalize()); return 0; } From f55ddef45cf43f667e367c6d9969fcdc3766e626 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Dec 2025 17:41:19 +0000 Subject: [PATCH 08/75] Q9 --- cpp/benchmarks/streaming/ndsh/q09.cpp | 463 +++++++++----------------- 1 file changed, 166 insertions(+), 297 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q09.cpp b/cpp/benchmarks/streaming/ndsh/q09.cpp index 2275995ee..2653143c6 100644 --- a/cpp/benchmarks/streaming/ndsh/q09.cpp +++ b/cpp/benchmarks/streaming/ndsh/q09.cpp @@ -7,12 +7,10 @@ #include #include #include -#include #include #include #include -#include #include #include @@ -31,22 +29,12 @@ #include #include #include -#include -#include -#include #include #include -#include -#include -#include -#include -#include #include -#include #include #include -#include #include #include #include @@ -527,103 +515,6 @@ rapidsmpf::streaming::Node write_parquet( } // namespace -struct ProgramOptions { - int num_streaming_threads{1}; - cudf::size_type num_rows_per_chunk{100'000'000}; - std::optional spill_device_limit{std::nullopt}; - bool use_shuffle_join = false; - std::string output_file; - std::string input_directory; -}; - -ProgramOptions parse_options(int argc, char** argv) { - ProgramOptions options; - - auto print_usage = [&argv]() { - std::cerr - << "Usage: " << argv[0] << " [options]\n" - << "Options:\n" - << " --num-streaming-threads Number of streaming threads (default: 1)\n" - << " --num-rows-per-chunk Number of rows per chunk (default: " - "100000000)\n" - << " --spill-device-limit Fractional spill device limit (default: " - "None)\n" - << " --use-shuffle-join Use shuffle join (default: false)\n" - << " --output-file Output file path (required)\n" - << " --input-directory Input directory path (required)\n" - << " --help Show this help message\n"; - }; - - static struct option long_options[] = { - {"num-streaming-threads", required_argument, nullptr, 1}, - {"num-rows-per-chunk", required_argument, nullptr, 2}, - {"use-shuffle-join", no_argument, nullptr, 3}, - {"output-file", required_argument, nullptr, 4}, - {"input-directory", required_argument, nullptr, 5}, - {"help", no_argument, nullptr, 6}, - {"spill-device-limit", required_argument, nullptr, 7}, - {nullptr, 0, nullptr, 0} - }; - - int opt; - int option_index = 0; - - bool saw_output_file = false; - bool saw_input_directory = false; - - while ((opt = getopt_long(argc, argv, "", long_options, &option_index)) != -1) { - switch (opt) { - case 1: - options.num_streaming_threads = std::atoi(optarg); - break; - case 2: - options.num_rows_per_chunk = std::atoi(optarg); - break; - case 3: - options.use_shuffle_join = true; - break; - case 4: - options.output_file = optarg; - saw_output_file = true; - break; - case 5: - options.input_directory = optarg; - saw_input_directory = true; - break; - case 6: - print_usage(); - std::exit(0); - case 7: - options.spill_device_limit = std::stod(optarg); - break; - case '?': - if (optopt == 0 && optind > 1) { - std::cerr << "Error: Unknown option '" << argv[optind - 1] << "'\n\n"; - } - print_usage(); - std::exit(1); - default: - print_usage(); - std::exit(1); - } - } - - // Check if required options were provided - if (!saw_output_file || !saw_input_directory) { - if (!saw_output_file) { - std::cerr << "Error: --output-file is required\n"; - } - if (!saw_input_directory) { - std::cerr << "Error: --input-directory is required\n"; - } - std::cerr << std::endl; - print_usage(); - std::exit(1); - } - - return options; -} - /** * @brief 
Run a derived version of TPC-H query 9. * @@ -730,224 +621,202 @@ int main(int argc, char** argv) { nodes.push_back(filter_part(ctx, part, filtered_part)); // p_partkey nodes.push_back(read_partsupp( ctx, + filtered_part, partsupp, - /* num_tickets */ 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); // ps_partkey, ps_suppkey, ps_supplycost - nodes.push_back( - // p_partkey x ps_partkey - rapidsmpf::ndsh::inner_join_broadcast( - ctx, - filtered_part, - partsupp, - part_x_partsupp, - {0}, - {0}, - rapidsmpf::OpID{static_cast(10 * i + op_id++)} - ) // p_partkey/ps_partkey, ps_suppkey, ps_supplycost - ); - nodes.push_back(read_supplier( + part_x_partsupp, + {0}, + {0}, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} + ) // p_partkey/ps_partkey, ps_suppkey, ps_supplycost + ); + nodes.push_back(read_supplier( + ctx, + supplier, + /* num_tickets */ 4, + arguments.num_rows_per_chunk, + arguments.input_directory + )); // s_nationkey, s_suppkey + nodes.push_back( + // s_suppkey x ps_suppkey + rapidsmpf::ndsh::inner_join_broadcast( ctx, supplier, - /* num_tickets */ 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); // s_nationkey, s_suppkey - nodes.push_back( - // s_suppkey x ps_suppkey - rapidsmpf::ndsh::inner_join_broadcast( - ctx, - supplier, - part_x_partsupp, - supplier_x_part_x_partsupp, - {1}, - {1}, - rapidsmpf::OpID{static_cast(10 * i + op_id++)} + part_x_partsupp, + supplier_x_part_x_partsupp, + {1}, + {1}, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} - ) // s_nationkey, s_suppkey/ps_suppkey, p_partkey/ps_partkey, - // ps_supplycost - ); - nodes.push_back(read_lineitem( + ) // s_nationkey, s_suppkey/ps_suppkey, p_partkey/ps_partkey, + // ps_supplycost + ); + nodes.push_back(read_lineitem( + ctx, + lineitem, + /* num_tickets */ 4, + arguments.num_rows_per_chunk, + arguments.input_directory + )); // l_discount, l_extendedprice, l_orderkey, l_partkey, l_quantity, + // l_suppkey + nodes.push_back( + // [p_partkey, ps_suppkey] x [l_partkey, l_suppkey] + rapidsmpf::ndsh::inner_join_broadcast( ctx, + supplier_x_part_x_partsupp, lineitem, + supplier_x_part_x_partsupp_x_lineitem, + {2, 1}, + {3, 5}, + rapidsmpf::OpID{static_cast(10 * i + op_id++)}, + rapidsmpf::ndsh::KeepKeys::NO + ) // s_nationkey, ps_supplycost, + // l_discount, l_extendedprice, l_orderkey, l_quantity + ); + auto nation = ctx->create_channel(); + auto orders = ctx->create_channel(); + nodes.push_back( + read_nation( + ctx, + nation, /* num_tickets */ 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); // l_discount, l_extendedprice, l_orderkey, l_partkey, l_quantity, - // l_suppkey + arguments.num_rows_per_chunk, + arguments.input_directory + ) // n_name, n_nationkey + ); + nodes.push_back( + read_orders( + ctx, + orders, + /* num_tickets */ 4, + arguments.num_rows_per_chunk, + arguments.input_directory + ) // o_orderdate, o_orderkey + ); + auto all_joined = ctx->create_channel(); + auto supplier_x_part_x_partsupp_x_lineitem_x_orders = ctx->create_channel(); + if (arguments.use_shuffle_join) { + auto supplier_x_part_x_partsupp_x_lineitem_shuffled = + ctx->create_channel(); + auto orders_shuffled = ctx->create_channel(); + // TODO: customisable + std::uint32_t num_partitions = 16; nodes.push_back( - // [p_partkey, ps_suppkey] x [l_partkey, l_suppkey] - rapidsmpf::ndsh::inner_join_broadcast( + rapidsmpf::ndsh::shuffle( ctx, - supplier_x_part_x_partsupp, - lineitem, supplier_x_part_x_partsupp_x_lineitem, - {2, 1}, - {3, 5}, - rapidsmpf::OpID{static_cast(10 * i + 
op_id++)}, - rapidsmpf::ndsh::KeepKeys::NO - ) // s_nationkey, ps_supplycost, - // l_discount, l_extendedprice, l_orderkey, l_quantity + supplier_x_part_x_partsupp_x_lineitem_shuffled, + {4}, + num_partitions, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} + ) ); - auto nation = ctx->create_channel(); - auto orders = ctx->create_channel(); nodes.push_back( - read_nation( + rapidsmpf::ndsh::shuffle( ctx, - nation, - /* num_tickets */ 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - ) // n_name, n_nationkey + orders, + orders_shuffled, + {1}, + num_partitions, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} + ) ); nodes.push_back( - read_orders( + // l_orderkey x o_orderkey + rapidsmpf::ndsh::inner_join_shuffle( ctx, - orders, - /* num_tickets */ 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - ) // o_orderdate, o_orderkey + supplier_x_part_x_partsupp_x_lineitem_shuffled, + orders_shuffled, + supplier_x_part_x_partsupp_x_lineitem_x_orders, + {4}, + {1}, + rapidsmpf::ndsh::KeepKeys::NO + ) // s_nationkey, ps_supplycost, l_discount, l_extendedprice, + // l_quantity, o_orderdate ); - auto all_joined = ctx->create_channel(); - auto supplier_x_part_x_partsupp_x_lineitem_x_orders = - ctx->create_channel(); - if (cmd_options.use_shuffle_join) { - auto supplier_x_part_x_partsupp_x_lineitem_shuffled = - ctx->create_channel(); - auto orders_shuffled = ctx->create_channel(); - // TODO: customisable - std::uint32_t num_partitions = 16; - nodes.push_back( - rapidsmpf::ndsh::shuffle( - ctx, - supplier_x_part_x_partsupp_x_lineitem, - supplier_x_part_x_partsupp_x_lineitem_shuffled, - {4}, - num_partitions, - rapidsmpf::OpID{ - static_cast(10 * i + op_id++) - } - ) - ); - nodes.push_back( - rapidsmpf::ndsh::shuffle( - ctx, - orders, - orders_shuffled, - {1}, - num_partitions, - rapidsmpf::OpID{ - static_cast(10 * i + op_id++) - } - ) - ); - nodes.push_back( - // l_orderkey x o_orderkey - rapidsmpf::ndsh::inner_join_shuffle( - ctx, - supplier_x_part_x_partsupp_x_lineitem_shuffled, - orders_shuffled, - supplier_x_part_x_partsupp_x_lineitem_x_orders, - {4}, - {1}, - rapidsmpf::ndsh::KeepKeys::NO - ) // s_nationkey, ps_supplycost, l_discount, l_extendedprice, - // l_quantity, o_orderdate - ); - } else { - nodes.push_back( - // l_orderkey x o_orderkey - rapidsmpf::ndsh::inner_join_broadcast( - ctx, - supplier_x_part_x_partsupp_x_lineitem, - orders, - supplier_x_part_x_partsupp_x_lineitem_x_orders, - {4}, - {1}, - rapidsmpf::OpID{ - static_cast(10 * i + op_id++) - }, - rapidsmpf::ndsh::KeepKeys::NO - ) // s_nationkey, ps_supplycost, l_discount, l_extendedprice, - // l_quantity, o_orderdate - ); - } + } else { nodes.push_back( - // n_nationkey x s_nationkey + // l_orderkey x o_orderkey rapidsmpf::ndsh::inner_join_broadcast( ctx, - nation, + supplier_x_part_x_partsupp_x_lineitem, + orders, supplier_x_part_x_partsupp_x_lineitem_x_orders, - all_joined, + {4}, {1}, - {0}, rapidsmpf::OpID{static_cast(10 * i + op_id++)}, rapidsmpf::ndsh::KeepKeys::NO - ) // n_name, ps_supplycost, l_discount, l_extendedprice, - // l_quantity, o_orderdate - ); - auto groupby_input = ctx->create_channel(); - nodes.push_back(select_columns(ctx, all_joined, groupby_input)); - auto chunkwise_groupby_output = ctx->create_channel(); - nodes.push_back( - chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby_output) - ); - auto concatenated_groupby_output = ctx->create_channel(); - nodes.push_back( - rapidsmpf::ndsh::concatenate( - ctx, - chunkwise_groupby_output, - concatenated_groupby_output, - 
rapidsmpf::ndsh::ConcatOrder::DONT_CARE - ) + ) // s_nationkey, ps_supplycost, l_discount, l_extendedprice, + // l_quantity, o_orderdate ); - auto groupby_output = ctx->create_channel(); - nodes.push_back(final_groupby_agg( + } + nodes.push_back( + // n_nationkey x s_nationkey + rapidsmpf::ndsh::inner_join_broadcast( ctx, + nation, + supplier_x_part_x_partsupp_x_lineitem_x_orders, + all_joined, + {1}, + {0}, + rapidsmpf::OpID{static_cast(10 * i + op_id++)}, + rapidsmpf::ndsh::KeepKeys::NO + ) // n_name, ps_supplycost, l_discount, l_extendedprice, + // l_quantity, o_orderdate + ); + auto groupby_input = ctx->create_channel(); + nodes.push_back(select_columns(ctx, all_joined, groupby_input)); + auto chunkwise_groupby_output = ctx->create_channel(); + nodes.push_back( + chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby_output) + ); + auto concatenated_groupby_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::concatenate( + ctx, + chunkwise_groupby_output, concatenated_groupby_output, - groupby_output, - rapidsmpf::OpID{static_cast(10 * i + op_id++)} - )); - auto sorted_output = ctx->create_channel(); - nodes.push_back(sort_by(ctx, groupby_output, sorted_output)); - nodes.push_back(write_parquet(ctx, sorted_output, output_path)); - } - auto end = std::chrono::steady_clock::now(); - std::chrono::duration pipeline = end - start; - start = std::chrono::steady_clock::now(); - { - RAPIDSMPF_NVTX_SCOPED_RANGE("Q9 Iteration"); - rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); - } - end = std::chrono::steady_clock::now(); - std::chrono::duration compute = end - start; - comm->logger().print( - "Iteration ", i, " pipeline construction time [s]: ", pipeline.count() + rapidsmpf::ndsh::ConcatOrder::DONT_CARE + ) ); - comm->logger().print("Iteration ", i, " compute time [s]: ", compute.count()); - timings.push_back(pipeline.count()); - timings.push_back(compute.count()); - ctx->comm()->logger().print(stats->report()); - RAPIDSMPF_MPI(MPI_Barrier(mpi_comm)); + auto groupby_output = ctx->create_channel(); + nodes.push_back(final_groupby_agg( + ctx, + concatenated_groupby_output, + groupby_output, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} + )); + auto sorted_output = ctx->create_channel(); + nodes.push_back(sort_by(ctx, groupby_output, sorted_output)); + nodes.push_back(write_parquet(ctx, sorted_output, output_path)); } - if (comm->rank() == 0) { - for (int i = 0; i < 2; i++) { - comm->logger().print( - "Iteration ", - i, - " pipeline construction time [s]: ", - timings[size_t(2 * i)] - ); - comm->logger().print( - "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] - ); - } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration pipeline = end - start; + start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Q9 Iteration"); + rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); } + end = std::chrono::steady_clock::now(); + std::chrono::duration compute = end - start; + timings.push_back(pipeline.count()); + timings.push_back(compute.count()); + ctx->comm()->logger().print(ctx->statistics()->report()); + } + if (ctx->comm()->rank() == 0) { + for (int i = 0; i < arguments.num_iterations; i++) { + ctx->comm()->logger().print( + "Iteration ", + i, + " pipeline construction time [s]: ", + timings[size_t(2 * i)] + ); + ctx->comm()->logger().print( + "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] + ); + } + } + if (rapidsmpf::mpi::is_initialized()) { + 
        RAPIDSMPF_MPI(MPI_Finalize());
+    }
-
-    RAPIDSMPF_MPI(MPI_Comm_free(&mpi_comm));
-    RAPIDSMPF_MPI(MPI_Finalize());
     return 0;
 }

From 25e60de735f622491235fa483cef21b3f57deb3e Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 4 Dec 2025 17:41:30 +0000
Subject: [PATCH 09/75] Docstring for main

---
 cpp/benchmarks/streaming/ndsh/q01.cpp | 52 ++++++++++++++-----------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp
index a0f058177..fc6b11c16 100644
--- a/cpp/benchmarks/streaming/ndsh/q01.cpp
+++ b/cpp/benchmarks/streaming/ndsh/q01.cpp
@@ -43,30 +43,6 @@
 #include "concatenate.hpp"
 #include "utils.hpp"
 
-// select
-//     l_orderkey,
-//     sum(l_extendedprice * (1 - l_discount)) as revenue,
-//     o_orderdate,
-//     o_shippriority
-// from
-//     customer,
-//     orders,
-//     lineitem
-// where
-//     c_mktsegment = 'BUILDING'
-//     and c_custkey = o_custkey
-//     and l_orderkey = o_orderkey
-//     and o_orderdate < '1995-03-15'
-//     and l_shipdate > '1995-03-15'
-// group by
-//     l_orderkey,
-//     o_orderdate,
-//     o_shippriority
-// order by
-//     revenue desc,
-//     o_orderdate
-// limit 10
-
 namespace {
 
 std::string get_table_path(
@@ -645,6 +621,34 @@ static __device__ void calculate_charge(double *charge, double discprice, double
 }
 } // namespace
 
+/**
+ * @brief Run a derived version of TPC-H query 1.
+ *
+ * The SQL form of the query is:
+ * @code{.sql}
+ * select
+ *     l_returnflag,
+ *     l_linestatus,
+ *     sum(l_quantity) as sum_qty,
+ *     sum(l_extendedprice) as sum_base_price,
+ *     sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ *     sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ *     avg(l_quantity) as avg_qty,
+ *     avg(l_extendedprice) as avg_price,
+ *     avg(l_discount) as avg_disc,
+ *     count(*) as count_order
+ * from
+ *     lineitem
+ * where
+ *     l_shipdate <= date '1998-12-01' - interval '90' day
+ * group by
+ *     l_returnflag,
+ *     l_linestatus
+ * order by
+ *     l_returnflag,
+ *     l_linestatus
+ * @endcode
+ */
 int main(int argc, char** argv) {
     cudaFree(nullptr);
     auto mr = rmm::mr::cuda_async_memory_resource{};

From 012e9fb10ddc1f1df59fba60b66e2b709640b0ab Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 4 Dec 2025 17:42:56 +0000
Subject: [PATCH 10/75] Docstring

---
 cpp/benchmarks/streaming/ndsh/q03.cpp | 30 ++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp
index 37671f309..a8dd433c3 100644
--- a/cpp/benchmarks/streaming/ndsh/q03.cpp
+++ b/cpp/benchmarks/streaming/ndsh/q03.cpp
@@ -607,6 +607,36 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl
 }
 } // namespace
 
+/**
+ * @brief Run a derived version of TPC-H query 3.
+ * + * The SQL form of the query is: + * @code{.sql} + * select + * nation, + * o_year, + * round(sum(amount), 2) as sum_profit + * from + * ( + * select + * n_name as nation, + * year(o_orderdate) as o_year, + * l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + * from + * part, + * supplier, + * lineitem, + * partsupp, + * orders, + * nation + * where + * s_suppkey = l_suppkey + * and ps_suppkey = l_suppkey + * and ps_partkey = l_partkey + * and p_partkey = l_partkey + * and o_orderkey = l_orderkey + * and s_nationkey = n_nationkey + * and p_name like '%green%' + * ) as profit + * group by + * nation, + * o_year + * order by + * nation, + * o_year desc + * @endcode{} + */ int main(int argc, char** argv) { cudaFree(nullptr); auto mr = rmm::mr::cuda_async_memory_resource{}; From 012e9fb10ddc1f1df59fba60b66e2b709640b0ab Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Dec 2025 14:32:20 +0000 Subject: [PATCH 11/75] Make broadcast public --- cpp/benchmarks/streaming/ndsh/join.cpp | 28 ++++++-------------------- cpp/benchmarks/streaming/ndsh/join.hpp | 21 +++++++++++++++++++ 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 271b6e5c0..3e0eafcb6 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -39,27 +39,11 @@ namespace rapidsmpf::ndsh { -namespace { - -/** - * @brief Broadcast the concatenation of all input messages to all ranks. - * - * @note Receives all input chunks, gathers from all ranks, and then provides concatenated - * output. - * - * @note Since this is used for unordered joins, the input order of `ch_in` across ranks - * is not preserved in the output. - * - * @param ctx Streaming context - * @param ch_in Input channel of `TableChunk`s - * @param tag Disambiguating tag for allgather - * - * @return Message containing the concatenation of all the input table chunks. 
- */
 coro::task<streaming::Message> broadcast(
     std::shared_ptr<streaming::Context> ctx,
     std::shared_ptr<streaming::Channel> ch_in,
-    OpID tag
+    OpID tag,
+    streaming::AllGather::Ordered ordered
 ) {
     streaming::ShutdownAtExit c{ch_in};
     co_await ctx->executor()->schedule();
@@ -117,7 +101,7 @@ coro::task<streaming::Message> broadcast(
         gatherer.insert(msg.sequence_number(), {std::move(packed_data)});
     }
     gatherer.insert_finished();
-    auto result = co_await gatherer.extract_all(streaming::AllGather::Ordered::NO);
+    auto result = co_await gatherer.extract_all(ordered);
     if (result.size() == 1) {
         co_return streaming::to_message(
             0,
@@ -222,8 +206,6 @@ streaming::Message inner_join_chunk(
     );
 }
 
-}  // namespace
-
 streaming::Node inner_join_broadcast(
     std::shared_ptr<streaming::Context> ctx,
     // We will always choose left as build table and do "broadcast" joins
@@ -239,7 +221,9 @@ streaming::Node inner_join_broadcast(
     co_await ctx->executor()->schedule();
     ctx->comm()->logger().print("Inner broadcast join ", static_cast(tag));
     auto build_table = to_device(
-        ctx, (co_await broadcast(ctx, left, tag)).release()
+        ctx,
+        (co_await broadcast(ctx, left, tag, streaming::AllGather::Ordered::NO))
+            .release()
     );
     ctx->comm()->logger().print(
         "Build table has ", build_table.table_view().num_rows(), " rows"
diff --git a/cpp/benchmarks/streaming/ndsh/join.hpp b/cpp/benchmarks/streaming/ndsh/join.hpp
index ddd799112..2178d873a 100644
--- a/cpp/benchmarks/streaming/ndsh/join.hpp
+++ b/cpp/benchmarks/streaming/ndsh/join.hpp
@@ -11,6 +11,7 @@
 
 #include
 #include
+#include
 
 #include
 #include
@@ -21,6 +22,26 @@ enum class KeepKeys : bool {
     YES,  ///< Key columns do appear in the output
 };
 
+/**
+ * @brief Broadcast the concatenation of all input messages to all ranks.
+ *
+ * @note Receives all input chunks, gathers from all ranks, and then provides concatenated
+ * output.
+ *
+ * @param ctx Streaming context
+ * @param ch_in Input channel of `TableChunk`s
+ * @param tag Disambiguating tag for allgather
+ * @param ordered Should the concatenated output be ordered
+ *
+ * @return Message containing the concatenation of all the input table chunks.
+ */
+coro::task<streaming::Message> broadcast(
+    std::shared_ptr<streaming::Context> ctx,
+    std::shared_ptr<streaming::Channel> ch_in,
+    OpID tag,
+    streaming::AllGather::Ordered ordered = streaming::AllGather::Ordered::YES
+);
+
 /**
  * @brief Perform a streaming inner join between two tables.
* From 83e73b163a537cca0f009e4f5de2233585392942 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Dec 2025 17:41:02 +0000 Subject: [PATCH 12/75] Whack a load of stuff in --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 2 +- cpp/benchmarks/streaming/ndsh/concatenate.cpp | 2 +- cpp/benchmarks/streaming/ndsh/groupby.cpp | 77 +++ cpp/benchmarks/streaming/ndsh/groupby.hpp | 51 ++ cpp/benchmarks/streaming/ndsh/join.cpp | 17 +- cpp/benchmarks/streaming/ndsh/join.hpp | 30 +- .../streaming/ndsh/parquet_writer.cpp | 63 ++ .../streaming/ndsh/parquet_writer.hpp | 34 ++ cpp/benchmarks/streaming/ndsh/q01.cpp | 568 +++++------------- cpp/benchmarks/streaming/ndsh/q03.cpp | 49 +- cpp/benchmarks/streaming/ndsh/sort.cpp | 67 +++ cpp/benchmarks/streaming/ndsh/sort.hpp | 39 ++ cpp/benchmarks/streaming/ndsh/utils.cpp | 22 +- cpp/benchmarks/streaming/ndsh/utils.hpp | 32 +- 14 files changed, 598 insertions(+), 455 deletions(-) create mode 100644 cpp/benchmarks/streaming/ndsh/groupby.cpp create mode 100644 cpp/benchmarks/streaming/ndsh/groupby.hpp create mode 100644 cpp/benchmarks/streaming/ndsh/parquet_writer.cpp create mode 100644 cpp/benchmarks/streaming/ndsh/parquet_writer.hpp create mode 100644 cpp/benchmarks/streaming/ndsh/sort.cpp create mode 100644 cpp/benchmarks/streaming/ndsh/sort.hpp diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index 0d99dceb6..f5e53096c 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -13,7 +13,7 @@ if(NOT RAPIDSMPF_HAVE_STREAMING) message(FATAL_ERROR "Streaming NDSH benchmarks require streaming support") endif() -add_library(rapidsmpfndsh concatenate.cpp join.cpp utils.cpp) +add_library(rapidsmpfndsh concatenate.cpp groupby.cpp join.cpp parquet_writer.cpp sort.cpp utils.cpp) set_target_properties( rapidsmpfndsh diff --git a/cpp/benchmarks/streaming/ndsh/concatenate.cpp b/cpp/benchmarks/streaming/ndsh/concatenate.cpp index 5fd57b76d..1c40c12b0 100644 --- a/cpp/benchmarks/streaming/ndsh/concatenate.cpp +++ b/cpp/benchmarks/streaming/ndsh/concatenate.cpp @@ -35,7 +35,7 @@ streaming::Node concatenate( std::vector messages; ctx->comm()->logger().print("Concatenate"); auto concat_stream = ctx->br()->stream_pool().get_stream(); - while (true) { + while (!ch_out->is_shutdown()) { co_await ctx->executor()->schedule(); auto msg = co_await ch_in->receive(); if (msg.empty()) { diff --git a/cpp/benchmarks/streaming/ndsh/groupby.cpp b/cpp/benchmarks/streaming/ndsh/groupby.cpp new file mode 100644 index 000000000..651bc0b84 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/groupby.cpp @@ -0,0 +1,77 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "groupby.hpp"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "utils.hpp"
+
+namespace rapidsmpf::ndsh {
+
+streaming::Node chunkwise_group_by(
+    std::shared_ptr<streaming::Context> ctx,
+    std::shared_ptr<streaming::Channel> ch_in,
+    std::shared_ptr<streaming::Channel> ch_out,
+    std::vector<cudf::size_type> keys,
+    std::vector<groupby_request> requests,
+    cudf::null_policy null_policy
+) {
+    streaming::ShutdownAtExit c{ch_in, ch_out};
+    co_await ctx->executor()->schedule();
+    while (!ch_out->is_shutdown()) {
+        auto msg = co_await ch_in->receive();
+        if (msg.empty()) {
+            break;
+        }
+        auto chunk = to_device(ctx, msg.release(), true);
+        auto stream = chunk.stream();
+        auto table = chunk.table_view();
+        auto agg_requests = std::vector<cudf::groupby::aggregation_request>();
+        agg_requests.reserve(requests.size());
+        std::ranges::transform(
+            requests, std::back_inserter(agg_requests), [&table](auto&& req) {
+                std::vector<std::unique_ptr<cudf::groupby_aggregation>> reqs;
+                for (auto&& x : req.requests) {
+                    reqs.push_back(x());
+                }
+                return cudf::groupby::aggregation_request{
+                    table.column(req.column_idx), std::move(reqs)
+                };
+            }
+        );
+        auto grouper =
+            cudf::groupby::groupby(table.select(keys), null_policy, cudf::sorted::NO);
+
+        auto [keys, aggregated] =
+            grouper.aggregate(agg_requests, stream, ctx->br()->device_mr());
+        std::ignore = std::move(chunk);
+        auto result = keys->release();
+        for (auto&& a : aggregated) {
+            std::ranges::move(a.results, std::back_inserter(result));
+        }
+        co_await ch_out->send(
+            streaming::to_message(
+                msg.sequence_number(),
+                std::make_unique<streaming::TableChunk>(
+                    std::make_unique<cudf::table>(std::move(result)), stream
+                )
+            )
+        );
+    }
+    co_await ch_out->drain(ctx->executor());
+}
+}  // namespace rapidsmpf::ndsh
diff --git a/cpp/benchmarks/streaming/ndsh/groupby.hpp b/cpp/benchmarks/streaming/ndsh/groupby.hpp
new file mode 100644
index 000000000..46eb24b50
--- /dev/null
+++ b/cpp/benchmarks/streaming/ndsh/groupby.hpp
@@ -0,0 +1,51 @@
+/**
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace rapidsmpf::ndsh {
+
+/// @brief Description of aggregation requests on a given column
+struct groupby_request {
+    cudf::size_type column_idx;  ///< Index of column in input table to aggregate
+    std::vector<std::function<std::unique_ptr<cudf::groupby_aggregation>()>>
+        requests;  ///< Functions to generate aggregations to perform on the column
+};
+
+/**
+ * @brief Perform a chunkwise grouped aggregation.
+ *
+ * @note Grouped chunks are not further grouped together.
+ *
+ * @param ctx Streaming context.
+ * @param ch_in Input channel of `TableChunk`s to aggregate
+ * @param ch_out Output channel of grouped `TableChunk`s
+ * @param keys Column indices of the key columns in the input channel.
+ * @param requests Vector of aggregation requests referencing columns in the input
+ * channel.
+ * @param null_policy How nulls in the key columns are treated.
+ *
+ * @return Coroutine representing the completion of the aggregation.
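+ *
+ * An illustrative sketch of wiring this node into a pipeline (the channel
+ * names, the `nodes` vector, and the single sum request are assumptions for
+ * the example, mirroring the q01/q03 drivers):
+ * @code{.cpp}
+ * auto ch_in = ctx->create_channel();
+ * auto ch_out = ctx->create_channel();
+ * std::vector<std::function<std::unique_ptr<cudf::groupby_aggregation>()>> aggs;
+ * aggs.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>);
+ * std::vector<groupby_request> requests;
+ * requests.emplace_back(3, std::move(aggs));  // sum of column 3
+ * // Group by key columns {0, 1, 2}, keeping null keys.
+ * nodes.push_back(chunkwise_group_by(
+ *     ctx, ch_in, ch_out, {0, 1, 2}, std::move(requests), cudf::null_policy::INCLUDE
+ * ));
+ * @endcode{}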
+ */ +streaming::Node chunkwise_group_by( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + std::vector keys, + std::vector requests, + cudf::null_policy null_policy + +); +} // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 3e0eafcb6..94faf71dd 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -129,6 +129,19 @@ coro::task broadcast( } } +streaming::Node broadcast( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + OpID tag, + streaming::AllGather::Ordered ordered +) { + streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + co_await ch_out->send(co_await broadcast(ctx, ch_in, tag, ordered)); + co_await ch_out->drain(ctx->executor()); +} + /** * @brief Join a table chunk against a build hash table returning a message of the result. * @@ -249,7 +262,7 @@ streaming::Node inner_join_broadcast( build_carrier = build_table.table_view().select(to_keep); } std::size_t sequence = 0; - while (true) { + while (!ch_out->is_shutdown()) { auto right_msg = co_await right->receive(); if (right_msg.empty()) { break; @@ -282,7 +295,7 @@ streaming::Node inner_join_shuffle( ctx->comm()->logger().print("Inner shuffle join"); co_await ctx->executor()->schedule(); CudaEvent build_event; - while (true) { + while (!ch_out->is_shutdown()) { // Requirement: two shuffles kick out partitions in the same order auto left_msg = co_await left->receive(); auto right_msg = co_await right->receive(); diff --git a/cpp/benchmarks/streaming/ndsh/join.hpp b/cpp/benchmarks/streaming/ndsh/join.hpp index 2178d873a..c6bcbab2a 100644 --- a/cpp/benchmarks/streaming/ndsh/join.hpp +++ b/cpp/benchmarks/streaming/ndsh/join.hpp @@ -35,13 +35,35 @@ enum class KeepKeys : bool { * * @return Message containing the concatenation of all the input table chunks. */ -coro::task broadcast( +[[nodiscard]] coro::task broadcast( std::shared_ptr ctx, std::shared_ptr ch_in, OpID tag, streaming::AllGather::Ordered ordered = streaming::AllGather::Ordered::YES ); +/** + * @brief Broadcast the concatenation of all input messages to all ranks. + * + * @note Receives all input chunks, gathers from all ranks, and then provides concatenated + * output. + * + * @param ctx Streaming context + * @param ch_in Input channel of `TableChunk`s + * @param ch_out Input channel of a single `TableChunk` + * @param tag Disambiguating tag for allgather + * @param ordered Should the concatenated output be ordered + * + * @return Coroutine representing the broadcast + */ +[[nodiscard]] streaming::Node broadcast( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + OpID tag, + streaming::AllGather::Ordered ordered = streaming::AllGather::Ordered::YES +); + /** * @brief Perform a streaming inner join between two tables. * @@ -60,7 +82,7 @@ coro::task broadcast( * * @return Coroutine representing the completion of the join. */ -streaming::Node inner_join_broadcast( +[[nodiscard]] streaming::Node inner_join_broadcast( std::shared_ptr ctx, // We will always choose left as build table and do "broadcast" joins std::shared_ptr left, @@ -88,7 +110,7 @@ streaming::Node inner_join_broadcast( * * @return Coroutine representing the completion of the join. 
*/ -streaming::Node inner_join_shuffle( +[[nodiscard]] streaming::Node inner_join_shuffle( std::shared_ptr ctx, std::shared_ptr left, std::shared_ptr right, @@ -110,7 +132,7 @@ streaming::Node inner_join_shuffle( * * @return Coroutine representing the completion of the shuffle. */ -streaming::Node shuffle( +[[nodiscard]] streaming::Node shuffle( std::shared_ptr ctx, std::shared_ptr ch_in, std::shared_ptr ch_out, diff --git a/cpp/benchmarks/streaming/ndsh/parquet_writer.cpp b/cpp/benchmarks/streaming/ndsh/parquet_writer.cpp new file mode 100644 index 000000000..f5a209ce0 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/parquet_writer.cpp @@ -0,0 +1,63 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "parquet_writer.hpp" + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "utils.hpp" + +namespace rapidsmpf::ndsh { + +rapidsmpf::streaming::Node write_parquet( + std::shared_ptr ctx, + std::shared_ptr ch_in, + cudf::io::sink_info sink, + std::vector column_names +) { + streaming::ShutdownAtExit c{ch_in}; + co_await ctx->executor()->schedule(); + auto builder = cudf::io::chunked_parquet_writer_options::builder(sink); + auto msg = co_await ch_in->receive(); + RAPIDSMPF_EXPECTS(!msg.empty(), "Writing from empty channel not supported"); + auto chunk = to_device(ctx, msg.release()); + auto table = chunk.table_view(); + auto metadata = cudf::io::table_input_metadata(table); + RAPIDSMPF_EXPECTS( + column_names.size() == metadata.column_metadata.size(), + "Mismatching number of column names and chunk columns" + ); + for (std::size_t i = 0; i < column_names.size(); i++) { + metadata.column_metadata[i].set_name(column_names[i]); + } + builder = builder.metadata(metadata); + auto options = builder.build(); + auto writer = cudf::io::chunked_parquet_writer(options); + writer.write(table); + while (true) { + msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + chunk = to_device(ctx, msg.release()); + table = chunk.table_view(); + RAPIDSMPF_EXPECTS( + static_cast(table.num_columns()) == column_names.size(), + "Mismatching number of column names and chunk columns" + ); + writer.write(table); + } + writer.close(); +} +} // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/parquet_writer.hpp b/cpp/benchmarks/streaming/ndsh/parquet_writer.hpp new file mode 100644 index 000000000..1451620e1 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/parquet_writer.hpp @@ -0,0 +1,34 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include +#include + +#include +#include + +#include +#include + +namespace rapidsmpf::ndsh { + +/** + * @brief Write chunks in a channel to an output sink + * + * @param ctx Streaming context + * @param ch_in Input channel of `TableChunk`s + * @param sink Sink to write into + * @param column_names Names of the columns to add to the parquet metadata + * + * @return Coroutine representing the write + */ +[[nodiscard]] rapidsmpf::streaming::Node write_parquet( + std::shared_ptr ctx, + std::shared_ptr ch_in, + cudf::io::sink_info sink, + std::vector column_names +); +} // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index fc6b11c16..3bda2f98c 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -3,11 +3,9 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include #include #include #include -#include #include #include @@ -17,11 +15,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -30,8 +26,6 @@ #include #include -#include -#include #include #include #include @@ -40,25 +34,15 @@ #include #include -#include "concatenate.hpp" +#include "groupby.hpp" +#include "join.hpp" +#include "parquet_writer.hpp" +#include "sort.hpp" #include "utils.hpp" namespace { -std::string get_table_path( - std::string const& input_directory, std::string const& table_name -) { - auto dir = input_directory.empty() ? "." : input_directory; - auto file_path = dir + "/" + table_name + ".parquet"; - - if (std::filesystem::exists(file_path)) { - return file_path; - } - - return dir + "/" + table_name + "/"; -} - -[[maybe_unused]] rapidsmpf::streaming::Node read_lineitem( +rapidsmpf::streaming::Node read_lineitem( std::shared_ptr ctx, std::shared_ptr ch_out, std::size_t num_producers, @@ -66,7 +50,7 @@ std::string get_table_path( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "lineitem") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "lineitem") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({ @@ -125,311 +109,88 @@ std::string get_table_path( ); } -// l_returnflag, l_linestatus, l_quantity, l_extendedprice, -// disc_price = (l_extendedprice * (1 - l_discount)), -// charge = (l_extendedprice * (1 - l_discount) * (1 + l_tax)) -// l_discount -[[maybe_unused]] rapidsmpf::streaming::Node chunkwise_groupby_agg( - [[maybe_unused]] std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - std::vector partial_results; - std::uint64_t sequence = 0; - ctx->comm()->logger().print("Chunkwise groupby"); - auto grouper = [&]() -> coro::task { - while (true) { - auto msg = co_await ch_in->receive(); - co_await ctx->executor()->schedule(); - if (msg.empty()) { - break; - } - auto chunk = rapidsmpf::ndsh::to_device( - ctx, msg.release() - ); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); +std::vector chunkwise_groupby_requests() { + auto requests = std::vector(); + std::vector()>> aggs; + // sum(l_quantity), sum(l_extendedprice), sum(disc_price), sum(charge), + // sum(l_discount) + for (cudf::size_type idx = 2; idx < 7; idx++) { + aggs.emplace_back(cudf::make_sum_aggregation); + requests.emplace_back(idx, std::move(aggs)); + } + // count(*) + aggs.emplace_back([]() { 
+ return cudf::make_count_aggregation( + cudf::null_policy::INCLUDE + ); + }); + requests.emplace_back(0, std::move(aggs)); + return requests; +} - auto grouper = cudf::groupby::groupby( - // group by [l_returnflag, l_linestatus] - table.select({0, 1}), - cudf::null_policy::EXCLUDE, - cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_quantity) - cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_extendedprice) - cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(disc_price) - cudf::groupby::aggregation_request(table.column(4), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(charge) - cudf::groupby::aggregation_request(table.column(5), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_discount) - cudf::groupby::aggregation_request(table.column(6), std::move(aggs)) - ); - aggs.push_back( - cudf::make_count_aggregation( - cudf::null_policy::INCLUDE - ) - ); - requests.push_back( - // count(*) - cudf::groupby::aggregation_request(table.column(0), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. - std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - co_await ch_out->send( - rapidsmpf::streaming::to_message( - sequence++, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); - } - }; - rapidsmpf::streaming::coro_results( - co_await coro::when_all(grouper(), grouper(), grouper(), grouper()) - ); - co_await ch_out->drain(ctx->executor()); +std::vector final_groupby_requests() { + auto requests = std::vector(); + std::vector()>> aggs; + // sum(l_quantity), sum(l_extendedprice), sum(disc_price), sum(charge), + // sum(l_discount), sum(count(*)) + for (cudf::size_type idx = 2; idx < 8; idx++) { + aggs.emplace_back(cudf::make_sum_aggregation); + requests.emplace_back(idx, std::move(aggs)); + } + return requests; } -[[maybe_unused]] rapidsmpf::streaming::Node final_groupby_agg( - [[maybe_unused]] std::shared_ptr ctx, +rapidsmpf::streaming::Node postprocess_group_by( + std::shared_ptr ctx, std::shared_ptr ch_in, - std::shared_ptr ch_out, - rapidsmpf::OpID tag + std::shared_ptr ch_out ) { rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; co_await ctx->executor()->schedule(); - // TODO: requires concatenated input stream. 
auto msg = co_await ch_in->receive(); - auto next = co_await ch_in->receive(); - ctx->comm()->logger().print("Final groupby"); - RAPIDSMPF_EXPECTS(next.empty(), "Expecting concatenated input at this point"); + RAPIDSMPF_EXPECTS( + (co_await ch_in->receive()).empty(), "Expecting concatenated input at this point" + ); auto chunk = rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); - std::unique_ptr local_result{nullptr}; - if (!table.is_empty()) { - auto grouper = cudf::groupby::groupby( - table.select({0, 1}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_quantity) - cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_extendedprice) - cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(disc_price) - cudf::groupby::aggregation_request(table.column(4), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(charge) - cudf::groupby::aggregation_request(table.column(5), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_discount) - cudf::groupby::aggregation_request(table.column(6), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(count(*)) - cudf::groupby::aggregation_request(table.column(7), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. - std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - local_result = std::make_unique(std::move(result)); - } - if (ctx->comm()->nranks() > 1) { - // Reduce across ranks... - // Need a reduce primitive in rapidsmpf, but let's just use an allgather and - // discard for now. 
- rapidsmpf::streaming::AllGather gatherer{ctx, tag}; - if (local_result) { - auto pack = - cudf::pack(local_result->view(), chunk_stream, ctx->br()->device_mr()); - gatherer.insert( - 0, - {rapidsmpf::PackedData( - std::move(pack.metadata), - ctx->br()->move(std::move(pack.gpu_data), chunk_stream) - )} - ); - } - gatherer.insert_finished(); - auto packed_data = - co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); - if (ctx->comm()->rank() == 0) { - auto global_result = rapidsmpf::unpack_and_concat( - rapidsmpf::unspill_partitions( - std::move(packed_data), ctx->br(), true, ctx->statistics() - ), - chunk_stream, - ctx->br(), - ctx->statistics() - ); - auto table = global_result->view(); - auto grouper = cudf::groupby::groupby( - table.select({0, 1}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_quantity) - cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_extendedprice) - cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(disc_price) - cudf::groupby::aggregation_request(table.column(4), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(charge) - cudf::groupby::aggregation_request(table.column(5), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(l_discount) - cudf::groupby::aggregation_request(table.column(6), std::move(aggs)) - ); - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - // sum(count(*)) - cudf::groupby::aggregation_request(table.column(7), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. 
- std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - auto count = std::move(result.back()); - result.pop_back(); - auto discount = std::move(result.back()); - result.pop_back(); - for (std::size_t i = 2; i < 4; i++) { - result.push_back( - cudf::binary_operation( - result[i]->view(), - count->view(), - cudf::binary_operator::TRUE_DIV, - cudf::data_type(cudf::type_id::FLOAT64), - chunk_stream, - ctx->br()->device_mr() - ) - ); - } - result.push_back( - cudf::binary_operation( - discount->view(), - count->view(), - cudf::binary_operator::TRUE_DIV, - cudf::data_type(cudf::type_id::FLOAT64), - chunk_stream, - ctx->br()->device_mr() - ) - ); - - result.push_back(std::move(count)); - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); - } else { - std::ignore = std::move(packed_data); - } - } else { - auto result = local_result->release(); - auto count = std::move(result.back()); - result.pop_back(); - auto discount = std::move(result.back()); - result.pop_back(); - for (std::size_t i = 2; i < 4; i++) { - result.push_back( - cudf::binary_operation( - result[i]->view(), - count->view(), - cudf::binary_operator::TRUE_DIV, - cudf::data_type(cudf::type_id::FLOAT64), - chunk_stream, - ctx->br()->device_mr() - ) - ); - } - result.push_back( + auto stream = chunk.stream(); + auto columns = cudf::table{chunk.table_view()}.release(); + std::ignore = std::move(chunk); + auto count = std::move(columns.back()); + columns.pop_back(); + auto discount = std::move(columns.back()); + columns.pop_back(); + for (std::size_t i = 2; i < 4; i++) { + columns.push_back( cudf::binary_operation( - discount->view(), + columns[i]->view(), count->view(), cudf::binary_operator::TRUE_DIV, cudf::data_type(cudf::type_id::FLOAT64), - chunk_stream, + stream, ctx->br()->device_mr() ) ); - result.push_back(std::move(count)); - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); } + columns.push_back( + cudf::binary_operation( + discount->view(), + count->view(), + cudf::binary_operator::TRUE_DIV, + cudf::data_type(cudf::type_id::FLOAT64), + stream, + ctx->br()->device_mr() + ) + ); + columns.push_back(std::move(count)); + co_await ch_out->send( + rapidsmpf::streaming::to_message( + msg.sequence_number(), + std::make_unique( + std::make_unique(std::move(columns)), stream + ) + ) + ); co_await ch_out->drain(ctx->executor()); } @@ -439,7 +200,7 @@ std::string get_table_path( // disc_price = (l_extendedprice * (1 - l_discount)), // charge = (l_extendedprice * (1 - l_discount) * (1 + l_tax)), // l_discount -[[maybe_unused]] rapidsmpf::streaming::Node select_columns_for_groupby( +rapidsmpf::streaming::Node select_columns_for_groupby( std::shared_ptr ctx, std::shared_ptr ch_in, std::shared_ptr ch_out @@ -447,7 +208,7 @@ std::string get_table_path( rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; co_await ctx->executor()->schedule(); - while (true) { + while (!ch_out->is_shutdown()) { auto msg = co_await ch_in->receive(); if (msg.empty()) { break; @@ -521,81 +282,6 @@ static __device__ void calculate_charge(double *charge, double discprice, double co_await ch_out->drain(ctx->executor()); } -[[maybe_unused]] rapidsmpf::streaming::Node sort_by( - [[maybe_unused]] std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out -) { 
- rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - co_await ctx->executor()->schedule(); - auto msg = co_await ch_in->receive(); - // We know we only have a single chunk from the groupby - if (msg.empty()) { - co_return; - } - ctx->comm()->logger().print("Sortby"); - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto table = chunk.table_view(); - auto result = rapidsmpf::streaming::to_message( - 0, - std::make_unique( - cudf::sort_by_key( - table, - table.select({0, 1}), - {cudf::order::ASCENDING, cudf::order::ASCENDING}, - {cudf::null_order::BEFORE, cudf::null_order::BEFORE}, - chunk.stream(), - ctx->br()->device_mr() - ), - chunk.stream() - ) - ); - co_await ch_out->send(std::move(result)); - co_await ch_out->drain(ctx->executor()); -} - -// In: o_orderkey, o_orderdate, o_shippriority, revenue -[[maybe_unused]] rapidsmpf::streaming::Node write_parquet( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::string output_path -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in}; - co_await ctx->executor()->schedule(); - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - co_return; - } - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto sink = cudf::io::sink_info(output_path); - auto table = chunk.table_view(); - auto builder = cudf::io::parquet_writer_options::builder(sink, table); - auto metadata = cudf::io::table_input_metadata(table); - metadata.column_metadata[0].set_name("l_returnflag"); - metadata.column_metadata[1].set_name("l_linestatus"); - metadata.column_metadata[2].set_name("sum_qty"); - metadata.column_metadata[3].set_name("sum_base_price"); - metadata.column_metadata[4].set_name("sum_disc_price"); - metadata.column_metadata[5].set_name("sum_charge"); - metadata.column_metadata[6].set_name("avg_qty"); - metadata.column_metadata[7].set_name("avg_price"); - metadata.column_metadata[8].set_name("avg_disc"); - metadata.column_metadata[9].set_name("count_order"); - builder = builder.metadata(metadata); - auto options = builder.build(); - cudf::io::write_parquet(options, chunk.stream()); - ctx->comm()->logger().print( - "Wrote chunk with ", - chunk.table_view().num_rows(), - " rows and ", - chunk.table_view().num_columns(), - " columns to ", - output_path - ); -} - [[maybe_unused]] rapidsmpf::streaming::Node consume( [[maybe_unused]] std::shared_ptr ctx, std::shared_ptr ch_in @@ -627,28 +313,26 @@ static __device__ void calculate_charge(double *charge, double discprice, double * The SQL form of the query is: * @code{.sql} * select - * l_orderkey, - * sum(l_extendedprice * (1 - l_discount)) as revenue, - * o_orderdate, - * o_shippriority + * l_returnflag, + * l_linestatus, + * sum(l_quantity) as sum_qty, + * sum(l_extendedprice) as sum_base_price, + * sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + * sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + * avg(l_quantity) as avg_qty, + * avg(l_extendedprice) as avg_price, + * avg(l_discount) as avg_disc, + * count(*) as count_order * from - * customer, - * orders, * lineitem * where - * c_mktsegment = 'BUILDING' - * and c_custkey = o_custkey - * and l_orderkey = o_orderkey - * and o_orderdate < '1995-03-15' - * and l_shipdate > '1995-03-15' + * l_shipdate <= DATE '1998-09-02' * group by - * l_orderkey, - * o_orderdate, - * o_shippriority + * l_returnflag, + * l_linestatus * order by - * revenue desc, - * o_orderdate - * limit 10 + * l_returnflag, + * l_linestatus * @endcode{} */ int main(int argc, char** argv) { @@ -685,21 +369,75 @@ int main(int 
argc, char** argv) { // l_discount nodes.push_back(select_columns_for_groupby(ctx, lineitem, groupby_input)); auto chunkwise_groupby = ctx->create_channel(); - nodes.push_back(chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby)); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + groupby_input, + chunkwise_groupby, + {0, 1}, + chunkwise_groupby_requests(), + cudf::null_policy::INCLUDE + ) + ); auto final_groupby_input = ctx->create_channel(); nodes.push_back( - rapidsmpf::ndsh::concatenate(ctx, chunkwise_groupby, final_groupby_input) + rapidsmpf::ndsh::broadcast( + ctx, + chunkwise_groupby, + final_groupby_input, + static_cast(10 * i + op_id++), + rapidsmpf::streaming::AllGather::Ordered::NO + ) ); - auto groupby_output = ctx->create_channel(); - nodes.push_back(final_groupby_agg( - ctx, - final_groupby_input, - groupby_output, - static_cast(10 * i + op_id++) - )); - auto sorted = ctx->create_channel(); - nodes.push_back(sort_by(ctx, groupby_output, sorted)); - nodes.push_back(write_parquet(ctx, sorted, output_path)); + auto final_groupby_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + final_groupby_input, + final_groupby_output, + {0, 1}, + final_groupby_requests(), + cudf::null_policy::INCLUDE + ) + ); + if (ctx->comm()->rank() == 0) { + auto sorted_input = ctx->create_channel(); + nodes.push_back( + postprocess_group_by(ctx, final_groupby_output, sorted_input) + ); + auto sorted_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_sort_by( + ctx, + sorted_input, + sorted_output, + {0, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + {cudf::order::ASCENDING, cudf::order::ASCENDING}, + {cudf::null_order::BEFORE, cudf::null_order::BEFORE} + ) + ); + nodes.push_back( + rapidsmpf::ndsh::write_parquet( + ctx, + sorted_output, + cudf::io::sink_info(output_path), + {"l_returnflag", + "l_linestatus", + "sum_qty", + "sum_base_price", + "sum_disc_price", + "sum_charge", + "avg_qty", + "avg_price", + "avg_disc", + "count_order"} + + ) + ); + } else { + nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_groupby_output)); + } } auto end = std::chrono::steady_clock::now(); std::chrono::duration pipeline = end - start; diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index a8dd433c3..170e28534 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -608,42 +608,33 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl } // namespace /** - * @brief Run a derived version of TPC-H query 1. + * @brief Run a derived version of TPC-H query 3. 
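+ *
+ * The streaming plan reads the three tables with filters pushed into the
+ * parquet scans, broadcast-joins customer onto orders and the result onto
+ * lineitem, computes the per-row revenue, aggregates chunkwise, combines the
+ * partial aggregates with an allgather and a second aggregation, and takes
+ * the top 10 rows on rank 0.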
* * The SQL form of the query is: * @code{.sql} * select - * nation, - * o_year, - * round(sum(amount), 2) as sum_profit + * l_orderkey, + * sum(l_extendedprice * (1 - l_discount)) as revenue, + * o_orderdate, + * o_shippriority * from - * ( - * select - * n_name as nation, - * year(o_orderdate) as o_year, - * l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount - * from - * part, - * supplier, - * lineitem, - * partsupp, - * orders, - * nation - * where - * s_suppkey = l_suppkey - * and ps_suppkey = l_suppkey - * and ps_partkey = l_partkey - * and p_partkey = l_partkey - * and o_orderkey = l_orderkey - * and s_nationkey = n_nationkey - * and p_name like '%green%' - * ) as profit + * customer, + * orders, + * lineitem + * where + * c_mktsegment = 'BUILDING' + * and c_custkey = o_custkey + * and l_orderkey = o_orderkey + * and o_orderdate < '1995-03-15' + * and l_shipdate > '1995-03-15' * group by - * nation, - * o_year + * l_orderkey, + * o_orderdate, + * o_shippriority * order by - * nation, - * o_year desc + * revenue desc, + * o_orderdate + * limit 10 * @endcode{} */ int main(int argc, char** argv) { diff --git a/cpp/benchmarks/streaming/ndsh/sort.cpp b/cpp/benchmarks/streaming/ndsh/sort.cpp new file mode 100644 index 000000000..5871d9fd2 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/sort.cpp @@ -0,0 +1,67 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "sort.hpp" + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "utils.hpp" + +namespace rapidsmpf::ndsh { + +rapidsmpf::streaming::Node chunkwise_sort_by( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + std::vector keys, + std::vector values, + std::vector order, + std::vector null_order +) { + streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + auto make_table = [&](streaming::TableChunk& chunk) { + if (std::ranges::equal(keys, values)) { + return cudf::sort( + chunk.table_view().select(keys), + order, + null_order, + chunk.stream(), + ctx->br()->device_mr() + ); + } else { + return cudf::sort_by_key( + chunk.table_view().select(values), + chunk.table_view().select(keys), + order, + null_order, + chunk.stream(), + ctx->br()->device_mr() + ); + } + }; + while (!ch_out->is_shutdown()) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = to_device(ctx, msg.release()); + co_await ch_out->send(to_message( + msg.sequence_number(), + std::make_unique(make_table(chunk), chunk.stream()) + )); + } + co_await ch_out->drain(ctx->executor()); +} +} // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/sort.hpp b/cpp/benchmarks/streaming/ndsh/sort.hpp new file mode 100644 index 000000000..9ef5ad655 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/sort.hpp @@ -0,0 +1,39 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include +#include + +#include + +#include +#include + +namespace rapidsmpf::ndsh { + +/** + * @brief Sort chunks in a channel + * + * @param ctx Streaming context + * @param ch_in Input channel of `TableChunk`s + * @param ch_out Output channel of sorted `TableChunk`s + * @param keys Indices of key columns in the input channel + * @param values Indices of value columns in the input channel + * @param order Sort order for each column named in `keys` + * @param null_order Null precedence for each column named in `keys` + * + * @return Coroutine representing the sort + */ +[[nodiscard]] rapidsmpf::streaming::Node chunkwise_sort_by( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + std::vector keys, + std::vector values, + std::vector order, + std::vector null_order +); +} // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index 1d67ff82d..4efe1c527 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -33,7 +33,7 @@ namespace rapidsmpf::ndsh { namespace detail { -std::vector list_parquet_files(std::string const& root_path) { +std::vector list_parquet_files(std::string const root_path) { auto root_entry = std::filesystem::directory_entry(std::filesystem::path(root_path)); RAPIDSMPF_EXPECTS( root_entry.exists() @@ -59,8 +59,28 @@ std::vector list_parquet_files(std::string const& root_path) { return result; } +std::string get_table_path( + std::string const& input_directory, std::string const& table_name +) { + auto dir = input_directory.empty() ? "." : input_directory; + auto file_path = dir + "/" + table_name + ".parquet"; + + if (std::filesystem::exists(file_path)) { + return file_path; + } + + return dir + "/" + table_name + "/"; +} + } // namespace detail +streaming::Node sink_channel( + std::shared_ptr ctx, std::shared_ptr ch +) { + co_await ctx->executor()->schedule(); + co_await ch->shutdown(); +} + streaming::TableChunk to_device( std::shared_ptr ctx, streaming::TableChunk&& chunk, diff --git a/cpp/benchmarks/streaming/ndsh/utils.hpp b/cpp/benchmarks/streaming/ndsh/utils.hpp index 4172735c4..a7e0cfc5c 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.hpp +++ b/cpp/benchmarks/streaming/ndsh/utils.hpp @@ -12,6 +12,9 @@ #include #include +#include "rapidsmpf/streaming/core/channel.hpp" +#include "rapidsmpf/streaming/core/node.hpp" + namespace rapidsmpf::ndsh { namespace detail { @@ -28,9 +31,35 @@ namespace detail { * @throws std::runtime_error if the `root_path` doesn't name a regular file or a * directory. Or if it does name a regular file, but that file doesn't end in `.parquet`. */ -[[nodiscard]] std::vector list_parquet_files(std::string const& root_path); +[[nodiscard]] std::vector list_parquet_files(std::string const root_path); + +/** + * @brief Get the path to a given table + * + * @param input_directory Input directory + * @param table_name Name of table to find. + * + * @return Path to given table. + */ +[[nodiscard]] std::string get_table_path( + std::string const& input_directory, std::string const& table_name +); + + } // namespace detail +/** + * @brief Sink messages into a channel and discard them. + * + * @param ctx Streaming context + * @param ch Channel to discard messages from. + * + * @return Coroutine representing the shutdown and discard of the channel. 
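+ *
+ * Example (a sketch of how the query drivers use this on non-writing ranks;
+ * the channel name is illustrative):
+ * @code{.cpp}
+ * if (ctx->comm()->rank() == 0) {
+ *     // ... attach top-k / writer nodes to final_output ...
+ * } else {
+ *     nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_output));
+ * }
+ * @endcode{}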
+ */ +[[nodiscard]] streaming::Node sink_channel( + std::shared_ptr ctx, std::shared_ptr ch +); + /** * @brief Ensure a `TableChunk` is on device. * @@ -99,5 +128,4 @@ std::shared_ptr create_context( ProgramOptions& arguments, RmmResourceAdaptor* mr ); - } // namespace rapidsmpf::ndsh From ed8890ab2849c27ca6905681f71f97465019028c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Dec 2025 18:02:25 +0000 Subject: [PATCH 13/75] Fix some bugs --- cpp/benchmarks/streaming/ndsh/q01.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index 3bda2f98c..d6d3551c0 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -154,7 +154,8 @@ rapidsmpf::streaming::Node postprocess_group_by( auto chunk = rapidsmpf::ndsh::to_device(ctx, msg.release()); auto stream = chunk.stream(); - auto columns = cudf::table{chunk.table_view()}.release(); + auto columns = + cudf::table{chunk.table_view(), stream, ctx->br()->device_mr()}.release(); std::ignore = std::move(chunk); auto count = std::move(columns.back()); columns.pop_back(); From cec0ef0376316305c1623848a12992801391b402 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Dec 2025 18:19:30 +0000 Subject: [PATCH 14/75] Use utils in q3 too --- cpp/benchmarks/streaming/ndsh/q01.cpp | 24 --- cpp/benchmarks/streaming/ndsh/q03.cpp | 219 +++++++----------------- cpp/benchmarks/streaming/ndsh/utils.cpp | 23 +++ cpp/benchmarks/streaming/ndsh/utils.hpp | 15 ++ 4 files changed, 101 insertions(+), 180 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index d6d3551c0..3ed01999f 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -282,30 +282,6 @@ static __device__ void calculate_charge(double *charge, double discprice, double } co_await ch_out->drain(ctx->executor()); } - -[[maybe_unused]] rapidsmpf::streaming::Node consume( - [[maybe_unused]] std::shared_ptr ctx, - std::shared_ptr ch_in -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in}; - co_await ctx->executor()->schedule(); - while (true) { - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - break; - } - auto chunk = rapidsmpf::ndsh::to_device( - ctx, msg.release() - ); - ctx->comm()->logger().print( - "Consumed chunk with ", - chunk.table_view().num_rows(), - " rows and ", - chunk.table_view().num_columns(), - " columns" - ); - } -} } // namespace /** diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index 170e28534..dc66d609d 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -45,25 +44,13 @@ #include #include -#include "concatenate.hpp" +#include "groupby.hpp" #include "join.hpp" +#include "parquet_writer.hpp" #include "utils.hpp" namespace { -std::string get_table_path( - std::string const& input_directory, std::string const& table_name -) { - auto dir = input_directory.empty() ? "." 
: input_directory; - auto file_path = dir + "/" + table_name + ".parquet"; - - if (std::filesystem::exists(file_path)) { - return file_path; - } - - return dir + "/" + table_name + "/"; -} - rapidsmpf::streaming::Node read_customer( std::shared_ptr ctx, std::shared_ptr ch_out, @@ -72,7 +59,7 @@ rapidsmpf::streaming::Node read_customer( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "customer") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "customer") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"c_custkey"}) // 0 @@ -119,7 +106,7 @@ rapidsmpf::streaming::Node read_customer( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "lineitem") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "lineitem") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({ @@ -183,7 +170,7 @@ rapidsmpf::streaming::Node read_orders( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "orders") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "orders") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({ @@ -240,58 +227,13 @@ rapidsmpf::streaming::Node read_orders( ); } -// In: [o_orderkey, o_orderdate, o_shippriority, revenue] -[[maybe_unused]] rapidsmpf::streaming::Node chunkwise_groupby_agg( - [[maybe_unused]] std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - std::vector partial_results; - std::uint64_t sequence = 0; - co_await ctx->executor()->schedule(); - ctx->comm()->logger().print("Chunkwise groupby"); - while (true) { - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - break; - } - auto chunk = rapidsmpf::ndsh::to_device( - ctx, msg.release() - ); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); - - auto grouper = cudf::groupby::groupby( - // group by [o_orderkey, o_orderdate, o_shippriority] - table.select({0, 1, 2}), - cudf::null_policy::EXCLUDE, - cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. 
- std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - co_await ch_out->send( - rapidsmpf::streaming::to_message( - sequence++, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); - } - co_await ch_out->drain(ctx->executor()); +std::vector chunkwise_groupby_requests() { + auto requests = std::vector(); + std::vector()>> aggs; + // sum(revenue) + aggs.emplace_back(cudf::make_sum_aggregation); + requests.emplace_back(3, std::move(aggs)); + return requests; } [[maybe_unused]] rapidsmpf::streaming::Node final_groupby_agg( @@ -482,11 +424,12 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl } // take first 10 rows -[[maybe_unused]] rapidsmpf::streaming::Node top_k( +[[maybe_unused]] rapidsmpf::streaming::Node top_k_by( std::shared_ptr ctx, std::shared_ptr ch_in, std::shared_ptr ch_out, std::vector keys, + std::vector values, std::vector order, cudf::size_type k ) { @@ -512,7 +455,7 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl ); partials.push_back( cudf::gather( - chunk.table_view(), + chunk.table_view().select(values), cudf::split(indices->view(), {k}, chunk.stream()).front(), cudf::out_of_bounds_policy::DONT_CHECK, chunk.stream(), @@ -522,7 +465,7 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl chunk_streams.push_back(chunk.stream()); } - // TODO: + // TODO: multi-node auto out_stream = chunk_streams.front(); rapidsmpf::CudaEvent event; rapidsmpf::cuda_stream_join( @@ -545,66 +488,6 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl ); co_await ch_out->drain(ctx->executor()); } - -// In: o_orderkey, o_orderdate, o_shippriority, revenue -[[maybe_unused]] rapidsmpf::streaming::Node write_parquet( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::string output_path -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in}; - co_await ctx->executor()->schedule(); - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - co_return; - } - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto sink = cudf::io::sink_info(output_path); - // orderkey, revenue, orderdate, shippriority - auto table = chunk.table_view().select({0, 3, 1, 2}); - auto builder = cudf::io::parquet_writer_options::builder(sink, table); - auto metadata = cudf::io::table_input_metadata(table); - metadata.column_metadata[0].set_name("l_orderkey"); - metadata.column_metadata[1].set_name("revenue"); - metadata.column_metadata[2].set_name("o_orderdate"); - metadata.column_metadata[3].set_name("o_shippriority"); - builder = builder.metadata(metadata); - auto options = builder.build(); - cudf::io::write_parquet(options, chunk.stream()); - ctx->comm()->logger().print( - "Wrote chunk with ", - chunk.table_view().num_rows(), - " rows and ", - chunk.table_view().num_columns(), - " columns to ", - output_path - ); -} - -[[maybe_unused]] rapidsmpf::streaming::Node consume( - [[maybe_unused]] std::shared_ptr ctx, - std::shared_ptr ch_in -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in}; - co_await ctx->executor()->schedule(); - while (true) { - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - break; - } - auto chunk = rapidsmpf::ndsh::to_device( - ctx, msg.release() - ); - ctx->comm()->logger().print( - "Consumed chunk with ", - chunk.table_view().num_rows(), - " rows and ", - chunk.table_view().num_columns(), - " columns" - ); - } 
-} } // namespace /** @@ -717,37 +600,61 @@ int main(int argc, char** argv) { auto chunkwise_groupby_output = ctx->create_channel(); // Out: o_orderkey, o_orderdate, o_shippriority, revenue nodes.push_back( - chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby_output) + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + groupby_input, + chunkwise_groupby_output, + {0, 1, 2}, + chunkwise_groupby_requests(), + cudf::null_policy::INCLUDE + ) ); - auto concatenated_groupby_output = ctx->create_channel(); + auto final_groupby_input = ctx->create_channel(); nodes.push_back( - rapidsmpf::ndsh::concatenate( + rapidsmpf::ndsh::broadcast( ctx, chunkwise_groupby_output, - concatenated_groupby_output, - rapidsmpf::ndsh::ConcatOrder::DONT_CARE + final_groupby_input, + static_cast(10 * i + op_id++), + rapidsmpf::streaming::AllGather::Ordered::NO ) ); - auto groupby_output = ctx->create_channel(); + auto final_groupby_output = ctx->create_channel(); // Out: o_orderkey, o_orderdate, o_shippriority, revenue - nodes.push_back(final_groupby_agg( - ctx, - concatenated_groupby_output, - groupby_output, - static_cast(10 * i + op_id++) - )); - auto topk = ctx->create_channel(); - // Out: o_orderkey, o_orderdate, o_shippriority, revenue - nodes.push_back(top_k( - ctx, - groupby_output, - topk, - {3, 1}, - {cudf::order::DESCENDING, cudf::order::ASCENDING}, - 10 - )); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + final_groupby_input, + final_groupby_output, + {0, 1, 2}, + chunkwise_groupby_requests(), + cudf::null_policy::INCLUDE - nodes.push_back(write_parquet(ctx, topk, output_path)); + ) + ); + auto topk = ctx->create_channel(); + if (ctx->comm()->rank() == 0) { + // Out: o_orderkey, revenue, o_orderdate, o_shippriority + nodes.push_back(top_k_by( + ctx, + final_groupby_output, + topk, + {3, 1}, + {0, 3, 1, 2}, + {cudf::order::DESCENDING, cudf::order::ASCENDING}, + 10 + )); + nodes.push_back( + rapidsmpf::ndsh::write_parquet( + ctx, + topk, + cudf::io::sink_info(output_path), + {"l_orderkey", "revenue", "o_orderdate", "o_shippriority"} + ) + ); + } else { + nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_groupby_output)); + } } auto end = std::chrono::steady_clock::now(); std::chrono::duration pipeline = end - start; diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index 4efe1c527..29fb04f4a 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -81,6 +81,29 @@ streaming::Node sink_channel( co_await ch->shutdown(); } +streaming::Node consume_channel( + std::shared_ptr ctx, std::shared_ptr ch_in +) { + streaming::ShutdownAtExit c{ch_in}; + co_await ctx->executor()->schedule(); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + if (msg.holds()) { + auto chunk = to_device(ctx, msg.release()); + ctx->comm()->logger().print( + "Consumed chunk with ", + chunk.table_view().num_rows(), + " rows and ", + chunk.table_view().num_columns(), + " columns" + ); + } + } +} + streaming::TableChunk to_device( std::shared_ptr ctx, streaming::TableChunk&& chunk, diff --git a/cpp/benchmarks/streaming/ndsh/utils.hpp b/cpp/benchmarks/streaming/ndsh/utils.hpp index a7e0cfc5c..d4f10d4b8 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.hpp +++ b/cpp/benchmarks/streaming/ndsh/utils.hpp @@ -60,6 +60,21 @@ namespace detail { std::shared_ptr ctx, std::shared_ptr ch ); +/** + * @brief Consume messages from a channel and discard them. 
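+ *
+ * This is primarily a debugging aid: it lets a partially built pipeline run
+ * to completion while logging a one-line summary of each chunk it swallows.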
+ * + * @param ctx Streaming context + * @param ch Channel to consume messages from. + * + * @note If the channel contains `TableChunk`s, moves them to device and prints small + * amount of detail about them (row and column count). + * + * @return Coroutine representing consuming and discarding messages in channel. + */ +[[nodiscard]] streaming::Node consume_channel( + std::shared_ptr ctx, std::shared_ptr ch_in +); + /** * @brief Ensure a `TableChunk` is on device. * From a7c17dab57f2262c2b69f7f7eb1eb249f5b1f04d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Dec 2025 18:22:34 +0000 Subject: [PATCH 15/75] TODO --- cpp/benchmarks/streaming/ndsh/q03.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index dc66d609d..d19c05ce8 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -579,6 +579,7 @@ int main(int argc, char** argv) { // join o_orderkey = l_orderkey // Out: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, // l_discount + // TODO: shuffle join option. nodes.push_back( rapidsmpf::ndsh::inner_join_broadcast( ctx, From 96a48842d2798b82d15757ba7af31f1c65634d71 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 8 Dec 2025 19:07:11 +0000 Subject: [PATCH 16/75] WIP: bloom filter --- cmake/thirdparty/get_cuco.cmake | 14 ++ cpp/CMakeLists.txt | 3 +- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 2 +- .../streaming/ndsh/bloom_filter.cpp | 133 ++++++++++++++++++ .../streaming/ndsh/bloom_filter.hpp | 58 ++++++++ .../streaming/ndsh/bloom_filter_impl.cu | 95 +++++++++++++ .../streaming/ndsh/bloom_filter_impl.hpp | 133 ++++++++++++++++++ cpp/benchmarks/streaming/ndsh/q03.cpp | 97 ++++++++++++- 8 files changed, 530 insertions(+), 5 deletions(-) create mode 100644 cmake/thirdparty/get_cuco.cmake create mode 100644 cpp/benchmarks/streaming/ndsh/bloom_filter.cpp create mode 100644 cpp/benchmarks/streaming/ndsh/bloom_filter.hpp create mode 100644 cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu create mode 100644 cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp diff --git a/cmake/thirdparty/get_cuco.cmake b/cmake/thirdparty/get_cuco.cmake new file mode 100644 index 000000000..1b0b6f7f9 --- /dev/null +++ b/cmake/thirdparty/get_cuco.cmake @@ -0,0 +1,14 @@ +# ============================================================================= +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on +# ============================================================================= + +function(find_and_configure_cucollections) + include(${rapids-cmake-dir}/cpm/cuco.cmake) + + rapids_cpm_cuco(BUILD_EXPORT_SET rapidsmpf-exports INSTALL_EXPORT_SET rapidsmpf-exports) +endfunction() + +find_and_configure_cucollections() diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 97f7c074f..ffbd4017a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -124,6 +124,7 @@ include(../cmake/thirdparty/get_cccl_cudax.cmake) include(../cmake/thirdparty/get_nvtx.cmake) include(../cmake/thirdparty/get_rmm.cmake) include(../cmake/thirdparty/get_cudf.cmake) +include(../cmake/thirdparty/get_cuco.cmake) if(RAPIDSMPF_HAVE_UCXX) rapids_find_package( ucxx REQUIRED @@ -272,7 +273,7 @@ endif() target_link_libraries( rapidsmpf - PUBLIC rmm::rmm cudf::cudf CCCL::CCCL $ + PUBLIC rmm::rmm cudf::cudf CCCL::CCCL cuco::cuco $ $ PRIVATE $<$:numa> $ diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index f5e53096c..89a72589c 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -13,7 +13,7 @@ if(NOT RAPIDSMPF_HAVE_STREAMING) message(FATAL_ERROR "Streaming NDSH benchmarks require streaming support") endif() -add_library(rapidsmpfndsh concatenate.cpp groupby.cpp join.cpp parquet_writer.cpp sort.cpp utils.cpp) +add_library(rapidsmpfndsh bloom_filter.cpp bloom_filter_impl.cu concatenate.cpp groupby.cpp join.cpp parquet_writer.cpp sort.cpp utils.cpp) set_target_properties( rapidsmpfndsh diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp new file mode 100644 index 000000000..cc622556c --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -0,0 +1,133 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "bloom_filter_impl.hpp" +#include "cuda_runtime_api.h" +#include "utils.hpp" + +namespace rapidsmpf::ndsh { +streaming::Node build_bloom_filter( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + OpID tag, + std::uint64_t seed +) { + streaming::ShutdownAtExit c{ch_in, ch_out}; + auto mr = ctx->br()->device_mr(); + auto stream = ctx->br()->stream_pool().get_stream(); + CudaEvent event; + constexpr std::size_t num_blocks = 8; + auto storage = create_filter_storage(num_blocks, stream, mr); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = to_device(ctx, msg.release()); + cuda_stream_join(stream, chunk.stream(), &event); + update_filter(storage, num_blocks, chunk.table_view(), seed, chunk.stream(), mr); + cuda_stream_join(chunk.stream(), stream, &event); + } + + auto allgather = streaming::AllGather(ctx, tag); + auto metadata = std::vector(storage.size); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + metadata.data(), storage.data, storage.size, cudaMemcpyDefault, stream.value() + )); + stream.synchronize(); + allgather.insert( + 0, {std::make_unique>(std::move(metadata)), nullptr} + ); + allgather.insert_finished(); + auto per_rank = co_await allgather.extract_all(streaming::AllGather::Ordered::NO); + auto merged = std::make_unique>(storage.size); + for (auto&& data : per_rank) { + for (std::size_t i = 0; i < storage.size; i++) { + (*merged)[i] |= static_cast((*data.metadata)[i]); + } + } + co_await ch_out->send(streaming::Message{0, std::move(merged), {}, {}}); + co_await ch_out->drain(ctx->executor()); +} + +streaming::Node apply_bloom_filter( + std::shared_ptr ctx, + std::shared_ptr bloom_filter, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + std::vector keys, + std::uint64_t seed +) { + streaming::ShutdownAtExit c{bloom_filter, ch_in, ch_out}; + co_await ctx->executor()->schedule(); + auto data = co_await bloom_filter->receive(); + RAPIDSMPF_EXPECTS(!data.empty(), "Bloom filter channel was shutdown"); + constexpr std::size_t num_blocks = 8; + auto stream = ctx->br()->stream_pool().get_stream(); + auto storage = create_filter_storage(num_blocks, stream, ctx->br()->device_mr()); + CudaEvent event; + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + storage.data, + data.get>().data(), + storage.size, + cudaMemcpyDefault, + stream + )); + while (!ch_out->is_shutdown()) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = to_device(ctx, msg.release()); + auto chunk_stream = chunk.stream(); + cuda_stream_join(stream, chunk_stream, &event); + auto mask = apply_filter( + storage, + num_blocks, + chunk.table_view().select(keys), + seed, + chunk_stream, + ctx->br()->device_mr() + ); + cuda_stream_join(chunk_stream, stream, &event); + RAPIDSMPF_EXPECTS( + mask.size() == static_cast(chunk.table_view().num_rows()), + "Invalid mask size" + ); + auto result = cudf::apply_boolean_mask( + chunk.table_view(), + cudf::column_view{ + cudf::data_type{cudf::type_id::BOOL8}, + static_cast(mask.size()), + mask.data(), + {}, + 0 + }, + chunk_stream, + ctx->br()->device_mr() + ); + std::ignore = std::move(chunk); + co_await ch_out->send(to_message( + msg.sequence_number(), + std::make_unique(std::move(result), chunk_stream) + )); + } + co_await ch_out->drain(ctx->executor()); +} +} // namespace rapidsmpf::ndsh diff --git 
a/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp new file mode 100644 index 000000000..87ef0e58b --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp @@ -0,0 +1,58 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include +#include + +#include + +#include +#include +#include +#include + +namespace rapidsmpf::ndsh { + +/** + * @brief Build a bloom filter of the input channel. + * + * @param ctx Streaming context. + * @param ch_in Input channel of `TableChunk`s to build bloom filter for. + * @param ch_out Output channel receiving a single message containing the bloom filter. + * @param tag Disambiguating tag to combine filters across ranks. + * @param seed Hash seed for hashing the keys. + * + * @return Coroutine representing the construction of the bloom filter. + */ +[[maybe_unused]] streaming::Node build_bloom_filter( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + OpID tag, + std::uint64_t seed +); + +/** + * @brief Apply a bloom filter to an input channel. + * + * @param ctx Streaming context. + * @param bloom_filter Channel containing the bloom filter (a single message). + * @param ch_in Input channel of `TableChunk`s to apply bloom filter to. + * @param ch_out Output channel receiving filtered `TableChunk`s. + * @param keys Indices selecting the key columns for the hash fingerprint + * @param seed Hash seed for hashing the keys. + * + * @return Coroutine representing the application of the bloom filter. + */ +streaming::Node apply_bloom_filter( + std::shared_ptr ctx, + std::shared_ptr bloom_filter, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + std::vector keys, + std::uint64_t seed +); +} // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu new file mode 100644 index 000000000..ed98369d2 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu @@ -0,0 +1,95 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "bloom_filter_impl.hpp" + +namespace rapidsmpf::ndsh { + +using policy_type = + cuco::default_filter_policy, std::uint32_t, 8>; +using bloom_filter = cuco::bloom_filter< + std::uint64_t, + cuco::extent, + cuda::thread_scope_device, + policy_type, + rmm::mr::polymorphic_allocator>; + +using bloom_filter_ref = bloom_filter::ref_type; + +aligned_buffer create_filter_storage( + std::size_t num_blocks, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr +) { + using type = bloom_filter_ref::filter_block_type; + return aligned_buffer{ + num_blocks * sizeof(type), std::alignment_of_v, stream, mr + }; +} + +void update_filter( + aligned_buffer& storage, + std::size_t num_blocks, + cudf::table_view const& values_to_hash, + std::uint64_t seed, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr +) { + auto policy = policy_type{}; + auto filter_ref = bloom_filter_ref( + static_cast(storage.data), + num_blocks, + cuco::thread_scope_device, + policy + ); + auto hashes = cudf::hashing::xxhash_64(values_to_hash, seed, stream, mr); + auto view = hashes->view(); + filter_ref.add_async(view.begin(), view.end(), stream); +} + +rmm::device_uvector apply_filter( + aligned_buffer& storage, + std::size_t num_blocks, + cudf::table_view const& values_to_hash, + std::uint64_t seed, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr +) { + auto policy = policy_type{}; + auto filter_ref = bloom_filter_ref( + static_cast(storage.data), + num_blocks, + cuco::thread_scope_device, + policy + ); + auto hashes = cudf::hashing::xxhash_64(values_to_hash, seed, stream, mr); + auto view = hashes->view(); + rmm::device_uvector result(static_cast(view.size()), stream, mr); + filter_ref.contains_async( + view.begin(), view.end(), result.begin(), stream + ); + return result; +} +} // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp new file mode 100644 index 000000000..e503f1b23 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp @@ -0,0 +1,133 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include +#include + +#include +#include +#include + +#include + +namespace rapidsmpf::ndsh { +/** + * @brief A type-erased buffer with an allocation with specified alignment. + */ +struct aligned_buffer { + /** + * @brief Construct the buffer. + * + * @param size The buffer size. + * @param alignment The requested alignment. + * @param stream Stream for allocations. + * @param mr Memory resource for allocations. + */ + explicit aligned_buffer( + std::size_t size, + std::size_t alignment, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr + ) + : size{size}, + alignment{alignment}, + stream{stream}, + mr{mr}, + data{mr.allocate(stream, size, alignment)} {} + + /** + * @brief Deallocate the buffer. 
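+     * Deallocation is stream-ordered on the stream captured at construction.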
+     */
+    ~aligned_buffer() {
+        mr.deallocate(stream, data, size, alignment);
+    }
+
+    aligned_buffer(aligned_buffer const&) = delete;
+    aligned_buffer& operator=(aligned_buffer const&) = delete;
+
+    aligned_buffer(aligned_buffer&& other)
+        : size{other.size},
+          alignment{other.alignment},
+          stream{other.stream},
+          mr{other.mr},
+          data{std::exchange(other.data, nullptr)} {}
+
+    aligned_buffer& operator=(aligned_buffer&& other) {
+        if (this != &other) {
+            RAPIDSMPF_EXPECTS(
+                !data,
+                "cannot move into an already initialized aligned_buffer",
+                std::invalid_argument
+            );
+        }
+        size = other.size;
+        alignment = other.alignment;
+        stream = other.stream;
+        mr = other.mr;
+        data = std::exchange(other.data, nullptr);
+        return *this;
+    }
+
+    std::size_t size;
+    std::size_t alignment;
+    rmm::cuda_stream_view stream;
+    rmm::device_async_resource_ref mr;
+    void* data;
+};
+
+/**
+ * @brief Create device storage for the bloom filter.
+ *
+ * @param num_blocks Number of blocks.
+ * @param stream CUDA stream for device launches and allocations.
+ * @param mr Memory resource for allocations.
+ *
+ * @return Device buffer sized and aligned for @p num_blocks filter blocks.
+ */
+aligned_buffer create_filter_storage(
+    std::size_t num_blocks,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr
+);
+
+/**
+ * @brief Update the filter with fingerprints from a table.
+ *
+ * @param storage Allocated device storage for the bloom filter.
+ * @param num_blocks Number of blocks.
+ * @param values_to_hash Table of values to hash.
+ * @param seed Hash seed.
+ * @param stream CUDA stream for device launches and allocations.
+ * @param mr Memory resource for allocations.
+ */
+void update_filter(
+    aligned_buffer& storage,
+    std::size_t num_blocks,
+    cudf::table_view const& values_to_hash,
+    std::uint64_t seed,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr
+);
+
+/**
+ * @brief Apply the filter to fingerprints from a table.
+ *
+ * @param storage Allocated device storage for the bloom filter.
+ * @param num_blocks Number of blocks.
+ * @param values_to_hash Table of values to hash.
+ * @param seed Hash seed.
+ * @param stream CUDA stream for device launches and allocations.
+ * @param mr Memory resource for allocations.
+ *
+ * @return Mask vector selecting rows in the table that were selected by the filter.
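+ * One entry per input row: nonzero means the row's fingerprint may be present
+ * (a bloom filter admits false positives but never false negatives).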
+ */ +rmm::device_uvector apply_filter( + aligned_buffer& storage, + std::size_t num_blocks, + cudf::table_view const& values_to_hash, + std::uint64_t seed, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr +); +} // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index d19c05ce8..e02aa8f35 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ #include #include +#include "bloom_filter.hpp" #include "groupby.hpp" #include "join.hpp" #include "parquet_writer.hpp" @@ -488,6 +490,70 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl ); co_await ch_out->drain(ctx->executor()); } + +[[maybe_unused]] +rapidsmpf::streaming::Node fanout_bounded( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch1_out, + std::vector ch1_cols, + std::shared_ptr ch2_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch1_out, ch2_out}; + + co_await ctx->executor()->schedule(); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + // Here, we know that copying ch1_cols (a single col) is better than copying + // ch2_cols (the whole table) + std::vector> tasks; + if (!ch1_out->is_shutdown()) { + auto msg1 = rapidsmpf::streaming::to_message( + msg.sequence_number(), + std::make_unique( + std::make_unique( + chunk.table_view().select(ch1_cols), + chunk.stream(), + ctx->br()->device_mr() + ), + chunk.stream() + ) + ); + tasks.push_back(ch1_out->send(std::move(msg1))); + } + if (!ch2_out->is_shutdown()) { + // TODO: We know here that ch2 wants the whole table. 
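+            // ch1's message above was built from a deep copy of the selected
+            // columns (the cudf::table constructor copies), so the original
+            // chunk can be moved into the ch2 message wholesale.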
+ tasks.push_back(ch2_out->send( + rapidsmpf::streaming::to_message( + msg.sequence_number(), + std::make_unique(std::move(chunk)) + ) + )); + } + if (!std::ranges::any_of( + rapidsmpf::streaming::coro_results( + co_await coro::when_all(std::move(tasks)) + ), + std::identity{} + )) + { + ctx->comm()->logger().print("Breaking after ", msg.sequence_number()); + break; + }; + } + + rapidsmpf::streaming::coro_results( + co_await coro::when_all( + ch1_out->drain(ctx->executor()), ch2_out->drain(ctx->executor()) + ) + ); +} } // namespace /** @@ -567,6 +633,21 @@ int main(int argc, char** argv) { rapidsmpf::ndsh::KeepKeys::NO ) ); + auto bloom_filter_input = ctx->create_channel(); + auto bloom_filter_output = ctx->create_channel(); + auto customer_x_orders_input = ctx->create_channel(); + nodes.push_back(fanout_bounded( + ctx, customer_x_orders, bloom_filter_input, {0}, customer_x_orders_input + )); + nodes.push_back( + rapidsmpf::ndsh::build_bloom_filter( + ctx, + bloom_filter_input, + bloom_filter_output, + static_cast(10 * i + op_id++), + cudf::DEFAULT_HASH_SEED + ) + ); // Out: l_orderkey, l_extendedprice, l_discount nodes.push_back(read_lineitem( ctx, @@ -575,7 +656,17 @@ int main(int argc, char** argv) { arguments.num_rows_per_chunk, arguments.input_directory )); - + auto lineitem_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::apply_bloom_filter( + ctx, + bloom_filter_output, + lineitem, + lineitem_output, + {0}, + cudf::DEFAULT_HASH_SEED + ) + ); // join o_orderkey = l_orderkey // Out: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, // l_discount @@ -583,8 +674,8 @@ int main(int argc, char** argv) { nodes.push_back( rapidsmpf::ndsh::inner_join_broadcast( ctx, - customer_x_orders, - lineitem, + customer_x_orders_input, + lineitem_output, customer_x_orders_x_lineitem, {0}, {0}, From 9558a0a249115287e0a470988b4f43bba7960dc2 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 11:23:53 +0000 Subject: [PATCH 17/75] Bloom filter updates --- .../streaming/ndsh/bloom_filter.cpp | 55 ++++++++++++------- .../streaming/ndsh/bloom_filter.hpp | 18 +++++- .../streaming/ndsh/bloom_filter_impl.cu | 46 ++++++++++------ 3 files changed, 78 insertions(+), 41 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index cc622556c..71d26e394 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -3,13 +3,19 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include "bloom_filter.hpp" + #include #include +#include +#include +#include #include #include #include +#include #include #include #include @@ -17,7 +23,6 @@ #include #include "bloom_filter_impl.hpp" -#include "cuda_runtime_api.h" #include "utils.hpp" namespace rapidsmpf::ndsh { @@ -26,13 +31,13 @@ streaming::Node build_bloom_filter( std::shared_ptr ch_in, std::shared_ptr ch_out, OpID tag, - std::uint64_t seed + std::uint64_t seed, + std::size_t num_blocks ) { streaming::ShutdownAtExit c{ch_in, ch_out}; auto mr = ctx->br()->device_mr(); auto stream = ctx->br()->stream_pool().get_stream(); CudaEvent event; - constexpr std::size_t num_blocks = 8; auto storage = create_filter_storage(num_blocks, stream, mr); while (true) { auto msg = co_await ch_in->receive(); @@ -40,9 +45,10 @@ streaming::Node build_bloom_filter( break; } auto chunk = to_device(ctx, msg.release()); - cuda_stream_join(stream, chunk.stream(), &event); - update_filter(storage, num_blocks, chunk.table_view(), seed, 
chunk.stream(), mr); + ctx->comm()->logger().print((int)chunk.table_view().column(0).type().id()); cuda_stream_join(chunk.stream(), stream, &event); + update_filter(storage, num_blocks, chunk.table_view(), seed, chunk.stream(), mr); + cuda_stream_join(stream, chunk.stream(), &event); } auto allgather = streaming::AllGather(ctx, tag); @@ -51,8 +57,11 @@ streaming::Node build_bloom_filter( metadata.data(), storage.data, storage.size, cudaMemcpyDefault, stream.value() )); stream.synchronize(); + auto [res, _] = ctx->br()->reserve(MemoryType::HOST, 0, true); allgather.insert( - 0, {std::make_unique>(std::move(metadata)), nullptr} + 0, + {std::make_unique>(std::move(metadata)), + ctx->br()->allocate(stream, std::move(res))} ); allgather.insert_finished(); auto per_rank = co_await allgather.extract_all(streaming::AllGather::Ordered::NO); @@ -72,19 +81,19 @@ streaming::Node apply_bloom_filter( std::shared_ptr ch_in, std::shared_ptr ch_out, std::vector keys, - std::uint64_t seed + std::uint64_t seed, + std::size_t num_blocks ) { streaming::ShutdownAtExit c{bloom_filter, ch_in, ch_out}; co_await ctx->executor()->schedule(); auto data = co_await bloom_filter->receive(); RAPIDSMPF_EXPECTS(!data.empty(), "Bloom filter channel was shutdown"); - constexpr std::size_t num_blocks = 8; auto stream = ctx->br()->stream_pool().get_stream(); auto storage = create_filter_storage(num_blocks, stream, ctx->br()->device_mr()); CudaEvent event; RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( storage.data, - data.get>().data(), + data.get>().data(), storage.size, cudaMemcpyDefault, stream @@ -96,7 +105,7 @@ streaming::Node apply_bloom_filter( } auto chunk = to_device(ctx, msg.release()); auto chunk_stream = chunk.stream(); - cuda_stream_join(stream, chunk_stream, &event); + cuda_stream_join(chunk_stream, stream, &event); auto mask = apply_filter( storage, num_blocks, @@ -105,22 +114,26 @@ streaming::Node apply_bloom_filter( chunk_stream, ctx->br()->device_mr() ); - cuda_stream_join(chunk_stream, stream, &event); + cuda_stream_join(stream, chunk_stream, &event); RAPIDSMPF_EXPECTS( mask.size() == static_cast(chunk.table_view().num_rows()), "Invalid mask size" ); + auto mask_view = cudf::column_view{ + cudf::data_type{cudf::type_id::BOOL8}, + static_cast(mask.size()), + mask.data(), + {}, + 0 + }; auto result = cudf::apply_boolean_mask( - chunk.table_view(), - cudf::column_view{ - cudf::data_type{cudf::type_id::BOOL8}, - static_cast(mask.size()), - mask.data(), - {}, - 0 - }, - chunk_stream, - ctx->br()->device_mr() + chunk.table_view(), mask_view, chunk_stream, ctx->br()->device_mr() + ); + ctx->comm()->logger().print( + "Sending filtered chunk ", + result->num_rows(), + " before ", + chunk.table_view().num_rows() ); std::ignore = std::move(chunk); co_await ch_out->send(to_message( diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp index 87ef0e58b..ca0218f58 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp @@ -24,6 +24,7 @@ namespace rapidsmpf::ndsh { * @param ch_out Output channel receiving a single message containing the bloom filter. * @param tag Disambiguating tag to combine filters across ranks. * @param seed Hash seed for hashing the keys. + * @param num_filter_blocks Number of blocks in the filter. * * @return Coroutine representing the construction of the bloom filter. 
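+ *
+ * @note With multiple ranks, each rank builds a filter over its local chunks;
+ * the per-rank filters are then OR-combined across ranks, so every rank
+ * receives the union filter.
+ *
+ * A wiring sketch mirroring the q03 usage (channel and variable names are
+ * illustrative):
+ *
+ * @code
+ * auto filter_ch = ctx->create_channel();
+ * nodes.push_back(build_bloom_filter(
+ *     ctx, build_side_ch, filter_ch, tag, cudf::DEFAULT_HASH_SEED, num_blocks
+ * ));
+ * nodes.push_back(apply_bloom_filter(
+ *     ctx, filter_ch, probe_ch, filtered_ch, {0}, cudf::DEFAULT_HASH_SEED, num_blocks
+ * ));
+ * @endcode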
*/ @@ -32,7 +33,8 @@ namespace rapidsmpf::ndsh { std::shared_ptr ch_in, std::shared_ptr ch_out, OpID tag, - std::uint64_t seed + std::uint64_t seed, + std::size_t num_filter_blocks ); /** @@ -44,6 +46,7 @@ namespace rapidsmpf::ndsh { * @param ch_out Output channel receiving filtered `TableChunk`s. * @param keys Indices selecting the key columns for the hash fingerprint * @param seed Hash seed for hashing the keys. + * @param num_filter_blocks Number of blocks in the filter. * * @return Coroutine representing the application of the bloom filter. */ @@ -53,6 +56,17 @@ streaming::Node apply_bloom_filter( std::shared_ptr ch_in, std::shared_ptr ch_out, std::vector keys, - std::uint64_t seed + std::uint64_t seed, + std::size_t num_filter_blocks ); + +/** + * @brief Return number of filter blocks for bloom filter given an L2 cache size + * + * @param l2cachesize Size of L2 cache in bytes + * + * @return Number of blocks to use. + */ +std::size_t num_filter_blocks(int l2cachesize); + } // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu index ed98369d2..87caa6115 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu @@ -3,6 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include #include #include @@ -19,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,25 +29,26 @@ namespace rapidsmpf::ndsh { -using policy_type = - cuco::default_filter_policy, std::uint32_t, 8>; -using bloom_filter = cuco::bloom_filter< +using KeyType = std::uint64_t; + +using PolicyType = cuco::arrow_filter_policy; +using BloomFilter = cuco::bloom_filter< std::uint64_t, cuco::extent, cuda::thread_scope_device, - policy_type, + PolicyType, rmm::mr::polymorphic_allocator>; -using bloom_filter_ref = bloom_filter::ref_type; +using BloomFilterRef = BloomFilter::ref_type; +using StorageType = BloomFilterRef::filter_block_type; aligned_buffer create_filter_storage( std::size_t num_blocks, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { - using type = bloom_filter_ref::filter_block_type; return aligned_buffer{ - num_blocks * sizeof(type), std::alignment_of_v, stream, mr + num_blocks * sizeof(StorageType), std::alignment_of_v, stream, mr }; } @@ -57,16 +60,19 @@ void update_filter( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { - auto policy = policy_type{}; - auto filter_ref = bloom_filter_ref( - static_cast(storage.data), + auto filter_ref = BloomFilterRef{ + static_cast(storage.data), num_blocks, cuco::thread_scope_device, - policy - ); + PolicyType{} + }; auto hashes = cudf::hashing::xxhash_64(values_to_hash, seed, stream, mr); auto view = hashes->view(); - filter_ref.add_async(view.begin(), view.end(), stream); + RAPIDSMPF_EXPECTS( + view.type().id() == cudf::type_to_id(), + "Hash values do not have correct type" + ); + filter_ref.add_async(view.begin(), view.end(), stream); } rmm::device_uvector apply_filter( @@ -77,19 +83,23 @@ rmm::device_uvector apply_filter( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { - auto policy = policy_type{}; - auto filter_ref = bloom_filter_ref( - static_cast(storage.data), + auto policy = PolicyType{}; + auto filter_ref = BloomFilterRef{ + static_cast(storage.data), num_blocks, cuco::thread_scope_device, policy - ); + }; auto hashes = cudf::hashing::xxhash_64(values_to_hash, seed, stream, mr); auto view = hashes->view(); rmm::device_uvector 
result(static_cast(view.size()), stream, mr); filter_ref.contains_async( - view.begin(), view.end(), result.begin(), stream + view.begin(), view.end(), result.begin(), stream ); return result; } + +std::size_t num_filter_blocks(int l2cachesize) { + return (static_cast(l2cachesize) * 2) / (3 * sizeof(StorageType)); +} } // namespace rapidsmpf::ndsh From d9a604634578a56f983c2666b74e09c41946dc3c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 11:24:15 +0000 Subject: [PATCH 18/75] Shuffle join option and bloom filter in q3 --- cpp/benchmarks/streaming/ndsh/q03.cpp | 193 +++++++++----------------- 1 file changed, 63 insertions(+), 130 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index e02aa8f35..5c02657d4 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -100,7 +101,7 @@ rapidsmpf::streaming::Node read_customer( ); } -[[maybe_unused]] rapidsmpf::streaming::Node read_lineitem( +rapidsmpf::streaming::Node read_lineitem( std::shared_ptr ctx, std::shared_ptr ch_out, std::size_t num_producers, @@ -238,120 +239,10 @@ std::vector chunkwise_groupby_requests() { return requests; } -[[maybe_unused]] rapidsmpf::streaming::Node final_groupby_agg( - [[maybe_unused]] std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out, - rapidsmpf::OpID tag -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - co_await ctx->executor()->schedule(); - // TODO: requires concatenated input stream. - auto msg = co_await ch_in->receive(); - auto next = co_await ch_in->receive(); - ctx->comm()->logger().print("Final groupby"); - RAPIDSMPF_EXPECTS(next.empty(), "Expecting concatenated input at this point"); - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); - std::unique_ptr local_result{nullptr}; - if (!table.is_empty()) { - auto grouper = cudf::groupby::groupby( - table.select({0, 1, 2}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request(table.column(3), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. - std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - local_result = std::make_unique(std::move(result)); - } - if (ctx->comm()->nranks() > 1) { - // Reduce across ranks... - // Need a reduce primitive in rapidsmpf, but let's just use an allgather and - // discard for now. 
- rapidsmpf::streaming::AllGather gatherer{ctx, tag}; - if (local_result) { - auto pack = - cudf::pack(local_result->view(), chunk_stream, ctx->br()->device_mr()); - gatherer.insert( - 0, - {rapidsmpf::PackedData( - std::move(pack.metadata), - ctx->br()->move(std::move(pack.gpu_data), chunk_stream) - )} - ); - } - gatherer.insert_finished(); - auto packed_data = - co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); - if (ctx->comm()->rank() == 0) { - auto global_result = rapidsmpf::unpack_and_concat( - rapidsmpf::unspill_partitions( - std::move(packed_data), ctx->br(), true, ctx->statistics() - ), - chunk_stream, - ctx->br(), - ctx->statistics() - ); - // We will only actually bother to do this on rank zero. - auto result_view = global_result->view(); - auto grouper = cudf::groupby::groupby( - result_view.select({0, 1, 2}), - cudf::null_policy::EXCLUDE, - cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request(result_view.column(3), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - global_result.reset(); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); - } else { - std::ignore = std::move(packed_data); - } - } else { - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::move(local_result), chunk_stream - ) - ) - ); - } - co_await ch_out->drain(ctx->executor()); -} - // In: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, l_discount // Out: o_orderkey, o_orderdate, o_shippriority, revenue = (l_extendedprice - (1 - // l_discount)) -[[maybe_unused]] rapidsmpf::streaming::Node select_columns_for_groupby( +rapidsmpf::streaming::Node select_columns_for_groupby( std::shared_ptr ctx, std::shared_ptr ch_in, std::shared_ptr ch_out @@ -425,8 +316,7 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl co_await ch_out->drain(ctx->executor()); } -// take first 10 rows -[[maybe_unused]] rapidsmpf::streaming::Node top_k_by( +rapidsmpf::streaming::Node top_k_by( std::shared_ptr ctx, std::shared_ptr ch_in, std::shared_ptr ch_out, @@ -594,6 +484,11 @@ int main(int argc, char** argv) { auto ctx = rapidsmpf::ndsh::create_context(arguments, &stats_wrapper); std::string output_path = arguments.output_file; std::vector timings; + int l2size; + int device; + RAPIDSMPF_CUDA_TRY(cudaGetDevice(&device)); + RAPIDSMPF_CUDA_TRY(cudaDeviceGetAttribute(&l2size, cudaDevAttrL2CacheSize, device)); + auto const num_filter_blocks = rapidsmpf::ndsh::num_filter_blocks(l2size); for (int i = 0; i < arguments.num_iterations; i++) { int op_id{0}; std::vector nodes; @@ -645,14 +540,15 @@ int main(int argc, char** argv) { bloom_filter_input, bloom_filter_output, static_cast(10 * i + op_id++), - cudf::DEFAULT_HASH_SEED + cudf::DEFAULT_HASH_SEED, + num_filter_blocks ) ); // Out: l_orderkey, l_extendedprice, l_discount nodes.push_back(read_lineitem( ctx, lineitem, - /* num_tickets */ 6, + /* num_tickets */ 4, arguments.num_rows_per_chunk, arguments.input_directory )); @@ -664,26 +560,63 @@ int main(int argc, char** argv) { lineitem, lineitem_output, {0}, - cudf::DEFAULT_HASH_SEED + cudf::DEFAULT_HASH_SEED, + 
num_filter_blocks ) ); // join o_orderkey = l_orderkey // Out: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, // l_discount // TODO: shuffle join option. - nodes.push_back( - rapidsmpf::ndsh::inner_join_broadcast( - ctx, - customer_x_orders_input, - lineitem_output, - customer_x_orders_x_lineitem, - {0}, - {0}, - static_cast(10 * i + op_id++), - rapidsmpf::ndsh::KeepKeys::YES - ) - ); - + if (arguments.use_shuffle_join) { + auto lineitem_shuffled = ctx->create_channel(); + auto customer_x_orders_shuffled = ctx->create_channel(); + std::uint32_t num_partitions = 16; + nodes.push_back( + rapidsmpf::ndsh::shuffle( + ctx, + lineitem_output, + lineitem_shuffled, + {0}, + num_partitions, + static_cast(10 * i + op_id++) + ) + ); + nodes.push_back( + rapidsmpf::ndsh::shuffle( + ctx, + customer_x_orders_input, + customer_x_orders_shuffled, + {0}, + num_partitions, + static_cast(10 * i + op_id++) + ) + ); + nodes.push_back( + rapidsmpf::ndsh::inner_join_shuffle( + ctx, + customer_x_orders_shuffled, + lineitem_shuffled, + customer_x_orders_x_lineitem, + {0}, + {0}, + rapidsmpf::ndsh::KeepKeys::YES + ) + ); + } else { + nodes.push_back( + rapidsmpf::ndsh::inner_join_broadcast( + ctx, + customer_x_orders_input, + lineitem_output, + customer_x_orders_x_lineitem, + {0}, + {0}, + static_cast(10 * i + op_id++), + rapidsmpf::ndsh::KeepKeys::YES + ) + ); + } auto groupby_input = ctx->create_channel(); // Out: o_orderkey, o_orderdate, o_shippriority, revenue nodes.push_back(select_columns_for_groupby( From 2fd1255ad9961cb2199a3288696e187018fcdc0c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 14:39:30 +0000 Subject: [PATCH 19/75] More stuff --- cpp/benchmarks/streaming/ndsh/bloom_filter.cpp | 1 + cpp/benchmarks/streaming/ndsh/q01.cpp | 4 ++-- cpp/benchmarks/streaming/ndsh/q03.cpp | 17 +++++++---------- cpp/benchmarks/streaming/ndsh/utils.hpp | 17 +++++++++++++++++ cpp/src/allgather/allgather.cpp | 3 +++ 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 71d26e394..389d41894 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -39,6 +39,7 @@ streaming::Node build_bloom_filter( auto stream = ctx->br()->stream_pool().get_stream(); CudaEvent event; auto storage = create_filter_storage(num_blocks, stream, mr); + RAPIDSMPF_CUDA_TRY(cudaMemsetAsync(storage.data, 0, storage.size, stream)); while (true) { auto msg = co_await ch_in->receive(); if (msg.empty()) { diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index 3ed01999f..b6f7fffad 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -72,8 +72,8 @@ rapidsmpf::streaming::Node read_lineitem( ); auto sys_days = cuda::std::chrono::sys_days(date); owner->push_back( - std::make_shared>( - sys_days, true, stream + std::make_shared>( + sys_days.time_since_epoch(), true, stream ) ); owner->push_back( diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index 5c02657d4..bf1692397 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -128,14 +128,14 @@ rapidsmpf::streaming::Node read_lineitem( ); auto sys_days = cuda::std::chrono::sys_days(date); owner->push_back( - std::make_shared>( - sys_days, true, stream + std::make_shared>( + sys_days.time_since_epoch(), true, stream ) ); owner->push_back( 
std::make_shared( *std::any_cast< - std::shared_ptr>>( + std::shared_ptr>>( owner->at(0) ) ) @@ -193,14 +193,14 @@ rapidsmpf::streaming::Node read_orders( ); auto sys_days = cuda::std::chrono::sys_days(date); owner->push_back( - std::make_shared>( - sys_days, true, stream + std::make_shared>( + sys_days.time_since_epoch(), true, stream ) ); owner->push_back( std::make_shared( *std::any_cast< - std::shared_ptr>>( + std::shared_ptr>>( owner->at(0) ) ) @@ -477,6 +477,7 @@ rapidsmpf::streaming::Node fanout_bounded( * @endcode{} */ int main(int argc, char** argv) { + rapidsmpf::ndsh::FinalizeMPI finalize{}; cudaFree(nullptr); auto mr = rmm::mr::cuda_async_memory_resource{}; auto stats_wrapper = rapidsmpf::RmmResourceAdaptor(&mr); @@ -707,9 +708,5 @@ int main(int argc, char** argv) { ); } } - - if (rapidsmpf::mpi::is_initialized()) { - RAPIDSMPF_MPI(MPI_Finalize()); - } return 0; } diff --git a/cpp/benchmarks/streaming/ndsh/utils.hpp b/cpp/benchmarks/streaming/ndsh/utils.hpp index d4f10d4b8..9c7a6a674 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.hpp +++ b/cpp/benchmarks/streaming/ndsh/utils.hpp @@ -9,9 +9,12 @@ #include #include +#include + #include #include +#include "rapidsmpf/communicator/mpi.hpp" #include "rapidsmpf/streaming/core/channel.hpp" #include "rapidsmpf/streaming/core/node.hpp" @@ -143,4 +146,18 @@ std::shared_ptr create_context( ProgramOptions& arguments, RmmResourceAdaptor* mr ); +/** + * @brief Finalize MPI when going out of scope. + */ +struct FinalizeMPI { + ~FinalizeMPI() noexcept { + if (rapidsmpf::mpi::is_initialized()) { + int flag; + RAPIDSMPF_MPI(MPI_Finalized(&flag)); + if (!flag) { + RAPIDSMPF_MPI(MPI_Finalize()); + } + } + } +}; } // namespace rapidsmpf::ndsh diff --git a/cpp/src/allgather/allgather.cpp b/cpp/src/allgather/allgather.cpp index 948f41434..6564ea283 100644 --- a/cpp/src/allgather/allgather.cpp +++ b/cpp/src/allgather/allgather.cpp @@ -18,6 +18,8 @@ #include #include +#include "rapidsmpf/nvtx.hpp" + namespace rapidsmpf::allgather { namespace detail { @@ -402,6 +404,7 @@ AllGather::AllGather( } ProgressThread::ProgressState AllGather::event_loop() { + RAPIDSMPF_NVTX_SCOPED_RANGE("AllGather::event_loop"); /* * Data flow: * User inserts into inserted_ From fafcce998a247bc4442c9ef491b3a4b01028917d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 14:41:24 +0000 Subject: [PATCH 20/75] Bloom filter ranges --- cpp/benchmarks/streaming/ndsh/bloom_filter.cpp | 1 - cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 389d41894..6c7b6e968 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -46,7 +46,6 @@ streaming::Node build_bloom_filter( break; } auto chunk = to_device(ctx, msg.release()); - ctx->comm()->logger().print((int)chunk.table_view().column(0).type().id()); cuda_stream_join(chunk.stream(), stream, &event); update_filter(storage, num_blocks, chunk.table_view(), seed, chunk.stream(), mr); cuda_stream_join(stream, chunk.stream(), &event); diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu index 87caa6115..f5dc5e58c 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu @@ -25,6 +25,8 @@ #include #include +#include + #include "bloom_filter_impl.hpp" namespace rapidsmpf::ndsh 
{ @@ -60,6 +62,7 @@ void update_filter( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { + RAPIDSMPF_NVTX_FUNC_RANGE(); auto filter_ref = BloomFilterRef{ static_cast(storage.data), num_blocks, @@ -83,6 +86,7 @@ rmm::device_uvector apply_filter( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { + RAPIDSMPF_NVTX_FUNC_RANGE(); auto policy = PolicyType{}; auto filter_ref = BloomFilterRef{ static_cast(storage.data), From 1dce429984a9ad38e7ae196fc5e982027189ba5e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 16:19:28 +0000 Subject: [PATCH 21/75] Timing info? --- cpp/benchmarks/streaming/ndsh/bloom_filter.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 6c7b6e968..9aa269292 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -35,22 +35,32 @@ streaming::Node build_bloom_filter( std::size_t num_blocks ) { streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); auto mr = ctx->br()->device_mr(); auto stream = ctx->br()->stream_pool().get_stream(); CudaEvent event; auto storage = create_filter_storage(num_blocks, stream, mr); RAPIDSMPF_CUDA_TRY(cudaMemsetAsync(storage.data, 0, storage.size, stream)); + CudaEvent storage_event; + storage_event.record(stream); + auto start = Clock::now(); + bool started = false; while (true) { auto msg = co_await ch_in->receive(); + if (!started) { + start = Clock::now(); + started = true; + } if (msg.empty()) { break; } auto chunk = to_device(ctx, msg.release()); - cuda_stream_join(chunk.stream(), stream, &event); + storage_event.stream_wait(chunk.stream()); update_filter(storage, num_blocks, chunk.table_view(), seed, chunk.stream(), mr); cuda_stream_join(stream, chunk.stream(), &event); } + ctx->comm()->logger().print("Bloom filter local build took ", Clock::now() - start); auto allgather = streaming::AllGather(ctx, tag); auto metadata = std::vector(storage.size); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( @@ -71,6 +81,7 @@ streaming::Node build_bloom_filter( (*merged)[i] |= static_cast((*data.metadata)[i]); } } + ctx->comm()->logger().print("Bloom filter build took ", Clock::now() - start); co_await ch_out->send(streaming::Message{0, std::move(merged), {}, {}}); co_await ch_out->drain(ctx->executor()); } From 04e09f590d70901e806d695c2db4f450024852c7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 16:24:56 +0000 Subject: [PATCH 22/75] More timings --- cpp/benchmarks/streaming/ndsh/bloom_filter.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 9aa269292..ac15ccca6 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -61,6 +61,7 @@ streaming::Node build_bloom_filter( } ctx->comm()->logger().print("Bloom filter local build took ", Clock::now() - start); + auto t0 = Clock::now(); auto allgather = streaming::AllGather(ctx, tag); auto metadata = std::vector(storage.size); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( @@ -73,8 +74,11 @@ streaming::Node build_bloom_filter( {std::make_unique>(std::move(metadata)), ctx->br()->allocate(stream, std::move(res))} ); + ctx->comm()->logger().print("Bloom filter allgather insertion ", Clock::now() - t0); + t0 = Clock::now(); allgather.insert_finished(); auto per_rank = 
co_await allgather.extract_all(streaming::AllGather::Ordered::NO); + ctx->comm()->logger().print("Bloom filter extract all took ", Clock::now() - t0); auto merged = std::make_unique>(storage.size); for (auto&& data : per_rank) { for (std::size_t i = 0; i < storage.size; i++) { From f8e483cb4aae185ee5188b641279543968c57212 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 16:34:55 +0000 Subject: [PATCH 23/75] Now? --- cpp/benchmarks/streaming/ndsh/bloom_filter.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index ac15ccca6..621a693d1 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -69,16 +69,21 @@ streaming::Node build_bloom_filter( )); stream.synchronize(); auto [res, _] = ctx->br()->reserve(MemoryType::HOST, 0, true); + ctx->comm()->logger().print("Bloom filter insertion starting at time ", Clock::now()); allgather.insert( 0, {std::make_unique>(std::move(metadata)), ctx->br()->allocate(stream, std::move(res))} ); - ctx->comm()->logger().print("Bloom filter allgather insertion ", Clock::now() - t0); + ctx->comm()->logger().print( + "Bloom filter allgather insertion ", Clock::now() - t0, " ", Clock::now() - start + ); t0 = Clock::now(); allgather.insert_finished(); auto per_rank = co_await allgather.extract_all(streaming::AllGather::Ordered::NO); - ctx->comm()->logger().print("Bloom filter extract all took ", Clock::now() - t0); + ctx->comm()->logger().print( + "Bloom filter extract all took ", Clock::now() - t0, " ", Clock::now() - start + ); auto merged = std::make_unique>(storage.size); for (auto&& data : per_rank) { for (std::size_t i = 0; i < storage.size; i++) { From c0cd4063f3b1af8b0f1a0383c83abdadd8fd1aea Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 16:37:41 +0000 Subject: [PATCH 24/75] Print time in logging output --- cpp/include/rapidsmpf/communicator/communicator.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/include/rapidsmpf/communicator/communicator.hpp b/cpp/include/rapidsmpf/communicator/communicator.hpp index f5b248069..fca608f79 100644 --- a/cpp/include/rapidsmpf/communicator/communicator.hpp +++ b/cpp/include/rapidsmpf/communicator/communicator.hpp @@ -17,6 +17,8 @@ #include #include +#include "rapidsmpf/utils.hpp" + /** * @namespace rapidsmpf * @brief RAPIDS Multi-Processor interfaces. 
@@ -337,7 +339,7 @@ class Communicator { virtual void do_log(LOG_LEVEL level, std::ostringstream&& ss) { std::ostringstream full_log_msg; full_log_msg << "[" << level_name(level) << ":" << comm_->rank() << ":" - << get_thread_id() << "] " << ss.str(); + << get_thread_id() << ":" << Clock::now() << "] " << ss.str(); std::lock_guard lock(mutex_); std::cout << full_log_msg.str() << std::endl; } From f74833c97b9e6b179134eba6d067c7dc993efa76 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 16:46:46 +0000 Subject: [PATCH 25/75] Avoid an alloc --- cpp/benchmarks/streaming/ndsh/bloom_filter.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 621a693d1..6d895c15f 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -84,10 +84,11 @@ streaming::Node build_bloom_filter( ctx->comm()->logger().print( "Bloom filter extract all took ", Clock::now() - t0, " ", Clock::now() - start ); - auto merged = std::make_unique>(storage.size); + auto merged = std::move(per_rank.back().metadata); + per_rank.pop_back(); for (auto&& data : per_rank) { for (std::size_t i = 0; i < storage.size; i++) { - (*merged)[i] |= static_cast((*data.metadata)[i]); + (*merged)[i] |= (*data.metadata)[i]; } } ctx->comm()->logger().print("Bloom filter build took ", Clock::now() - start); @@ -113,7 +114,7 @@ streaming::Node apply_bloom_filter( CudaEvent event; RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( storage.data, - data.get>().data(), + data.get>().data(), storage.size, cudaMemcpyDefault, stream From c2a0bc3835560d27ba8f2fb1e9339a195f9e46a8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 16:48:25 +0000 Subject: [PATCH 26/75] More --- cpp/benchmarks/streaming/ndsh/bloom_filter.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 6d895c15f..849fb57e1 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -60,16 +60,22 @@ streaming::Node build_bloom_filter( cuda_stream_join(stream, chunk.stream(), &event); } - ctx->comm()->logger().print("Bloom filter local build took ", Clock::now() - start); + ctx->comm()->logger().print( + "Bloom filter of ", storage.size, " bytes local build took ", Clock::now() - start + ); auto t0 = Clock::now(); - auto allgather = streaming::AllGather(ctx, tag); auto metadata = std::vector(storage.size); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( metadata.data(), storage.data, storage.size, cudaMemcpyDefault, stream.value() )); stream.synchronize(); + ctx->comm()->logger().print( + "Bloom filter allocate and copy to host took ", t0 - start + ); + t0 = Clock::now(); auto [res, _] = ctx->br()->reserve(MemoryType::HOST, 0, true); ctx->comm()->logger().print("Bloom filter insertion starting at time ", Clock::now()); + auto allgather = streaming::AllGather(ctx, tag); allgather.insert( 0, {std::make_unique>(std::move(metadata)), From 5d667870fbe66f2270f15c0b56a02a024e60bcdc Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 17:58:53 +0000 Subject: [PATCH 27/75] Propagate exceptions from parquet chunk read failures --- cpp/benchmarks/streaming/ndsh/q03.cpp | 13 +++++++------ cpp/src/streaming/cudf/parquet.cpp | 19 ++++++++++++++++--- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git 
a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index bf1692397..768c642f3 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -118,6 +118,7 @@ rapidsmpf::streaming::Node read_lineitem( "l_discount", // 2 }) .build(); + using timestamp_type = cudf::timestamp_ms; auto filter_expr = [&]() -> std::unique_ptr { auto stream = ctx->br()->stream_pool().get_stream(); auto owner = new std::vector; @@ -128,14 +129,13 @@ rapidsmpf::streaming::Node read_lineitem( ); auto sys_days = cuda::std::chrono::sys_days(date); owner->push_back( - std::make_shared>( + std::make_shared>( sys_days.time_since_epoch(), true, stream ) ); owner->push_back( std::make_shared( - *std::any_cast< - std::shared_ptr>>( + *std::any_cast>>( owner->at(0) ) ) @@ -183,6 +183,7 @@ rapidsmpf::streaming::Node read_orders( "o_custkey" // 3 }) .build(); + using timestamp_type = cudf::timestamp_ms; auto filter_expr = [&]() -> std::unique_ptr { auto stream = ctx->br()->stream_pool().get_stream(); auto owner = new std::vector; @@ -193,14 +194,13 @@ rapidsmpf::streaming::Node read_orders( ); auto sys_days = cuda::std::chrono::sys_days(date); owner->push_back( - std::make_shared>( + std::make_shared>( sys_days.time_since_epoch(), true, stream ) ); owner->push_back( std::make_shared( - *std::any_cast< - std::shared_ptr>>( + *std::any_cast>>( owner->at(0) ) ) @@ -358,6 +358,7 @@ rapidsmpf::streaming::Node top_k_by( } // TODO: multi-node + RAPIDSMPF_EXPECTS(chunk_streams.size() > 0, "No chunks to sort"); auto out_stream = chunk_streams.front(); rapidsmpf::CudaEvent event; rapidsmpf::cuda_stream_join( diff --git a/cpp/src/streaming/cudf/parquet.cpp b/cpp/src/streaming/cudf/parquet.cpp index 8ac1eefd7..26d8c1fc3 100644 --- a/cpp/src/streaming/cudf/parquet.cpp +++ b/cpp/src/streaming/cudf/parquet.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -90,9 +91,21 @@ Node produce_chunks( co_await ctx->executor()->schedule(); // TODO: This reads the metadata ntasks times. // See https://github.com/rapidsai/cudf/issues/20311 - auto sent = co_await ticket->send( - read_parquet_chunk(ctx, stream, chunk_options, chunk.sequence_number) - ); + auto [msg, exception] = [&]() -> std::pair { + try { + return { + read_parquet_chunk(ctx, stream, chunk_options, chunk.sequence_number), + nullptr + }; + } catch (...) { + return {Message{}, std::current_exception()}; + } + }(); + if (exception != nullptr) { + co_await ch_out->shutdown(); + std::rethrow_exception(exception); + } + auto sent = co_await ticket->send(std::move(msg)); if (!sent) { // Output channel is shutdown, no need for more reads. 
break; From b42ca4efdef453e35f35cba8236bbac7c62db9cd Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 10 Dec 2025 17:59:10 +0000 Subject: [PATCH 28/75] Try merging bloom filters on device --- .../streaming/ndsh/bloom_filter.cpp | 59 ++++++++++--------- .../streaming/ndsh/bloom_filter_impl.cu | 32 ++++++++++ .../streaming/ndsh/bloom_filter_impl.hpp | 6 ++ 3 files changed, 69 insertions(+), 28 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 849fb57e1..830967d93 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -64,23 +64,20 @@ streaming::Node build_bloom_filter( "Bloom filter of ", storage.size, " bytes local build took ", Clock::now() - start ); auto t0 = Clock::now(); - auto metadata = std::vector(storage.size); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - metadata.data(), storage.data, storage.size, cudaMemcpyDefault, stream.value() - )); - stream.synchronize(); + auto metadata = std::make_unique>(1); + auto [res, _] = ctx->br()->reserve(MemoryType::DEVICE, storage.size, true); + auto buf = ctx->br()->allocate(stream, std::move(res)); + buf->write_access([&](std::byte* data, rmm::cuda_stream_view stream) { + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + data, storage.data, storage.size, cudaMemcpyDefault, stream.value() + )); + }); ctx->comm()->logger().print( - "Bloom filter allocate and copy to host took ", t0 - start + "Bloom filter allocate and copy to buf took ", t0 - start ); t0 = Clock::now(); - auto [res, _] = ctx->br()->reserve(MemoryType::HOST, 0, true); - ctx->comm()->logger().print("Bloom filter insertion starting at time ", Clock::now()); auto allgather = streaming::AllGather(ctx, tag); - allgather.insert( - 0, - {std::make_unique>(std::move(metadata)), - ctx->br()->allocate(stream, std::move(res))} - ); + allgather.insert(0, {std::move(metadata), std::move(buf)}); ctx->comm()->logger().print( "Bloom filter allgather insertion ", Clock::now() - t0, " ", Clock::now() - start ); @@ -90,15 +87,28 @@ streaming::Node build_bloom_filter( ctx->comm()->logger().print( "Bloom filter extract all took ", Clock::now() - t0, " ", Clock::now() - start ); - auto merged = std::move(per_rank.back().metadata); - per_rank.pop_back(); + auto temp_storage = create_filter_storage(num_blocks, stream, mr); for (auto&& data : per_rank) { - for (std::size_t i = 0; i < storage.size; i++) { - (*merged)[i] |= (*data.metadata)[i]; - } + cuda_stream_join(data.data->stream(), stream, &event); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + temp_storage.data, + data.data->data(), + temp_storage.size, + cudaMemcpyDefault, + stream + )); + cuda_stream_join(stream, data.data->stream(), &event); + merge_filters(storage, temp_storage, num_blocks, stream); } ctx->comm()->logger().print("Bloom filter build took ", Clock::now() - start); - co_await ch_out->send(streaming::Message{0, std::move(merged), {}, {}}); + co_await ch_out->send( + streaming::Message{ + 0, + std::make_unique(std::move(storage)), + {}, + {} + } + ); co_await ch_out->drain(ctx->executor()); } @@ -115,16 +125,9 @@ streaming::Node apply_bloom_filter( co_await ctx->executor()->schedule(); auto data = co_await bloom_filter->receive(); RAPIDSMPF_EXPECTS(!data.empty(), "Bloom filter channel was shutdown"); - auto stream = ctx->br()->stream_pool().get_stream(); - auto storage = create_filter_storage(num_blocks, stream, ctx->br()->device_mr()); + auto storage = data.release(); + auto stream = storage.stream; CudaEvent 
event; - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - storage.data, - data.get>().data(), - storage.size, - cudaMemcpyDefault, - stream - )); while (!ch_out->is_shutdown()) { auto msg = co_await ch_in->receive(); if (msg.empty()) { diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu index f5dc5e58c..137adeb97 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,7 @@ #include #include +#include #include #include @@ -103,6 +105,36 @@ rmm::device_uvector apply_filter( return result; } +void merge_filters( + aligned_buffer& storage, + const aligned_buffer& other, + std::size_t num_blocks, + rmm::cuda_stream_view stream +) { + auto ref_out = BloomFilterRef{ + static_cast(storage.data), + num_blocks, + cuco::thread_scope_device, + PolicyType{} + }; + auto ref_in = BloomFilterRef{ + static_cast(other.data), + num_blocks, + cuco::thread_scope_device, + PolicyType{} + }; + using word_type = BloomFilterRef::word_type; + RAPIDSMPF_CUDA_TRY( + cub::DeviceTransform::Transform( + cuda::std::tuple{ref_out.data(), ref_in.data()}, + ref_out.data(), + num_blocks * BloomFilterRef::words_per_block, + [] __device__(word_type left, word_type right) { return left | right; }, + stream.value() + ) + ); +} + std::size_t num_filter_blocks(int l2cachesize) { return (static_cast(l2cachesize) * 2) / (3 * sizeof(StorageType)); } diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp index e503f1b23..ba5bf13df 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp @@ -110,6 +110,12 @@ void update_filter( rmm::device_async_resource_ref mr ); +void merge_filters( + aligned_buffer& storage, + aligned_buffer const& other, + std::size_t num_blocks, + rmm::cuda_stream_view stream +); /** * @brief Apply the filter to fingerprints from a table. 
* From cf3d990b3e4bb20e66cf702ac5850396b44d353a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 11 Dec 2025 16:10:43 +0000 Subject: [PATCH 29/75] Remove debug --- .../streaming/ndsh/bloom_filter.cpp | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 830967d93..7c751c95c 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -43,14 +43,8 @@ streaming::Node build_bloom_filter( RAPIDSMPF_CUDA_TRY(cudaMemsetAsync(storage.data, 0, storage.size, stream)); CudaEvent storage_event; storage_event.record(stream); - auto start = Clock::now(); - bool started = false; while (true) { auto msg = co_await ch_in->receive(); - if (!started) { - start = Clock::now(); - started = true; - } if (msg.empty()) { break; } @@ -60,10 +54,6 @@ streaming::Node build_bloom_filter( cuda_stream_join(stream, chunk.stream(), &event); } - ctx->comm()->logger().print( - "Bloom filter of ", storage.size, " bytes local build took ", Clock::now() - start - ); - auto t0 = Clock::now(); auto metadata = std::make_unique>(1); auto [res, _] = ctx->br()->reserve(MemoryType::DEVICE, storage.size, true); auto buf = ctx->br()->allocate(stream, std::move(res)); @@ -72,21 +62,10 @@ streaming::Node build_bloom_filter( data, storage.data, storage.size, cudaMemcpyDefault, stream.value() )); }); - ctx->comm()->logger().print( - "Bloom filter allocate and copy to buf took ", t0 - start - ); - t0 = Clock::now(); auto allgather = streaming::AllGather(ctx, tag); allgather.insert(0, {std::move(metadata), std::move(buf)}); - ctx->comm()->logger().print( - "Bloom filter allgather insertion ", Clock::now() - t0, " ", Clock::now() - start - ); - t0 = Clock::now(); allgather.insert_finished(); auto per_rank = co_await allgather.extract_all(streaming::AllGather::Ordered::NO); - ctx->comm()->logger().print( - "Bloom filter extract all took ", Clock::now() - t0, " ", Clock::now() - start - ); auto temp_storage = create_filter_storage(num_blocks, stream, mr); for (auto&& data : per_rank) { cuda_stream_join(data.data->stream(), stream, &event); @@ -100,7 +79,6 @@ streaming::Node build_bloom_filter( cuda_stream_join(stream, data.data->stream(), &event); merge_filters(storage, temp_storage, num_blocks, stream); } - ctx->comm()->logger().print("Bloom filter build took ", Clock::now() - start); co_await ch_out->send( streaming::Message{ 0, @@ -159,12 +137,6 @@ streaming::Node apply_bloom_filter( auto result = cudf::apply_boolean_mask( chunk.table_view(), mask_view, chunk_stream, ctx->br()->device_mr() ); - ctx->comm()->logger().print( - "Sending filtered chunk ", - result->num_rows(), - " before ", - chunk.table_view().num_rows() - ); std::ignore = std::move(chunk); co_await ch_out->send(to_message( msg.sequence_number(), From eacf67ac5f6a921fa24c5dbcc54bd3f614e0930b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 11 Dec 2025 18:19:17 +0000 Subject: [PATCH 30/75] Done --- .../streaming/ndsh/bloom_filter.cpp | 84 ++++----- .../streaming/ndsh/bloom_filter.hpp | 17 +- .../streaming/ndsh/bloom_filter_impl.cu | 134 ++++++++------- .../streaming/ndsh/bloom_filter_impl.hpp | 160 ++++++++++-------- cpp/benchmarks/streaming/ndsh/q03.cpp | 13 +- 5 files changed, 205 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp index 7c751c95c..b1613f572 100644 --- 
a/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.cpp @@ -22,7 +22,6 @@ #include #include -#include "bloom_filter_impl.hpp" #include "utils.hpp" namespace rapidsmpf::ndsh { @@ -39,54 +38,44 @@ streaming::Node build_bloom_filter( auto mr = ctx->br()->device_mr(); auto stream = ctx->br()->stream_pool().get_stream(); CudaEvent event; - auto storage = create_filter_storage(num_blocks, stream, mr); - RAPIDSMPF_CUDA_TRY(cudaMemsetAsync(storage.data, 0, storage.size, stream)); - CudaEvent storage_event; - storage_event.record(stream); + auto filter = std::make_unique(num_blocks, seed, stream, mr); + CudaEvent build_event; + build_event.record(stream); while (true) { auto msg = co_await ch_in->receive(); if (msg.empty()) { break; } auto chunk = to_device(ctx, msg.release()); - storage_event.stream_wait(chunk.stream()); - update_filter(storage, num_blocks, chunk.table_view(), seed, chunk.stream(), mr); + build_event.stream_wait(chunk.stream()); + filter->add(chunk.table_view(), chunk.stream(), mr); cuda_stream_join(stream, chunk.stream(), &event); } - auto metadata = std::make_unique>(1); - auto [res, _] = ctx->br()->reserve(MemoryType::DEVICE, storage.size, true); - auto buf = ctx->br()->allocate(stream, std::move(res)); - buf->write_access([&](std::byte* data, rmm::cuda_stream_view stream) { - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - data, storage.data, storage.size, cudaMemcpyDefault, stream.value() - )); - }); - auto allgather = streaming::AllGather(ctx, tag); - allgather.insert(0, {std::move(metadata), std::move(buf)}); - allgather.insert_finished(); - auto per_rank = co_await allgather.extract_all(streaming::AllGather::Ordered::NO); - auto temp_storage = create_filter_storage(num_blocks, stream, mr); - for (auto&& data : per_rank) { - cuda_stream_join(data.data->stream(), stream, &event); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - temp_storage.data, - data.data->data(), - temp_storage.size, - cudaMemcpyDefault, - stream - )); - cuda_stream_join(stream, data.data->stream(), &event); - merge_filters(storage, temp_storage, num_blocks, stream); - } - co_await ch_out->send( - streaming::Message{ - 0, - std::make_unique(std::move(storage)), - {}, - {} + if (ctx->comm()->nranks() > 1) { + auto metadata = std::make_unique>(1); + auto [res, _] = ctx->br()->reserve(MemoryType::DEVICE, filter->size(), true); + auto buf = ctx->br()->allocate(stream, std::move(res)); + buf->write_access([&](std::byte* data, rmm::cuda_stream_view stream) { + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + data, filter->data(), filter->size(), cudaMemcpyDefault, stream.value() + )); + }); + auto allgather = streaming::AllGather(ctx, tag); + allgather.insert(0, {std::move(metadata), std::move(buf)}); + allgather.insert_finished(); + auto per_rank = co_await allgather.extract_all(streaming::AllGather::Ordered::NO); + auto other = BloomFilter(num_blocks, seed, stream, mr); + for (auto&& data : per_rank) { + cuda_stream_join(data.data->stream(), stream, &event); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + other.data(), data.data->data(), other.size(), cudaMemcpyDefault, stream + )); + cuda_stream_join(stream, data.data->stream(), &event); + filter->merge(other, stream); } - ); + } + co_await ch_out->send(streaming::Message{0, std::move(filter), {}, {}}); co_await ch_out->drain(ctx->executor()); } @@ -95,16 +84,14 @@ streaming::Node apply_bloom_filter( std::shared_ptr bloom_filter, std::shared_ptr ch_in, std::shared_ptr ch_out, - std::vector keys, - std::uint64_t seed, - std::size_t num_blocks + 
std::vector keys ) { streaming::ShutdownAtExit c{bloom_filter, ch_in, ch_out}; co_await ctx->executor()->schedule(); auto data = co_await bloom_filter->receive(); RAPIDSMPF_EXPECTS(!data.empty(), "Bloom filter channel was shutdown"); - auto storage = data.release(); - auto stream = storage.stream; + auto filter = data.release(); + auto stream = filter.stream(); CudaEvent event; while (!ch_out->is_shutdown()) { auto msg = co_await ch_in->receive(); @@ -114,13 +101,8 @@ streaming::Node apply_bloom_filter( auto chunk = to_device(ctx, msg.release()); auto chunk_stream = chunk.stream(); cuda_stream_join(chunk_stream, stream, &event); - auto mask = apply_filter( - storage, - num_blocks, - chunk.table_view().select(keys), - seed, - chunk_stream, - ctx->br()->device_mr() + auto mask = filter.contains( + chunk.table_view().select(keys), chunk_stream, ctx->br()->device_mr() ); cuda_stream_join(stream, chunk_stream, &event); RAPIDSMPF_EXPECTS( diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp b/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp index ca0218f58..7f0629060 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter.hpp @@ -14,6 +14,8 @@ #include #include +#include "bloom_filter_impl.hpp" + namespace rapidsmpf::ndsh { /** @@ -45,8 +47,6 @@ namespace rapidsmpf::ndsh { * @param ch_in Input channel of `TableChunk`s to apply bloom filter to. * @param ch_out Output channel receiving filtered `TableChunk`s. * @param keys Indices selecting the key columns for the hash fingerprint - * @param seed Hash seed for hashing the keys. - * @param num_filter_blocks Number of blocks in the filter. * * @return Coroutine representing the application of the bloom filter. */ @@ -55,18 +55,7 @@ streaming::Node apply_bloom_filter( std::shared_ptr bloom_filter, std::shared_ptr ch_in, std::shared_ptr ch_out, - std::vector keys, - std::uint64_t seed, - std::size_t num_filter_blocks + std::vector keys ); -/** - * @brief Return number of filter blocks for bloom filter given an L2 cache size - * - * @param l2cachesize Size of L2 cache in bytes - * - * @return Number of blocks to use. 
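// Sizing intuition for the replacement, BloomFilter::fitting_num_blocks
// (illustrative, assuming Arrow's 32-byte split-block layout): targeting two
// thirds of L2, a 50 MiB cache yields (50 MiB * 2) / (3 * 32 B), roughly
// 1.09e6 blocks, i.e. about 33 MiB of filter storage.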
- */ -std::size_t num_filter_blocks(int l2cachesize); - } // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu index 137adeb97..31a0c3736 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu @@ -11,13 +11,14 @@ #include #include -#include #include +#include #include #include #include #include +#include #include #include @@ -33,45 +34,46 @@ namespace rapidsmpf::ndsh { +namespace { using KeyType = std::uint64_t; using PolicyType = cuco::arrow_filter_policy; -using BloomFilter = cuco::bloom_filter< - std::uint64_t, +using BloomFilterRefType = cuco::bloom_filter_ref< + KeyType, cuco::extent, - cuda::thread_scope_device, - PolicyType, - rmm::mr::polymorphic_allocator>; + cuco::thread_scope_device, + PolicyType>; +using StorageType = BloomFilterRefType::filter_block_type; -using BloomFilterRef = BloomFilter::ref_type; -using StorageType = BloomFilterRef::filter_block_type; +} // namespace -aligned_buffer create_filter_storage( +BloomFilter::BloomFilter( std::size_t num_blocks, + std::uint64_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr -) { - return aligned_buffer{ - num_blocks * sizeof(StorageType), std::alignment_of_v, stream, mr - }; +) + : num_blocks_{num_blocks}, + seed_{seed}, + storage_{ + num_blocks * sizeof(StorageType), std::alignment_of_v, stream, mr + } { + RAPIDSMPF_CUDA_TRY(cudaMemsetAsync(storage_.data, 0, storage_.size, stream)); } -void update_filter( - aligned_buffer& storage, - std::size_t num_blocks, +void BloomFilter::add( cudf::table_view const& values_to_hash, - std::uint64_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { RAPIDSMPF_NVTX_FUNC_RANGE(); - auto filter_ref = BloomFilterRef{ - static_cast(storage.data), - num_blocks, + auto filter_ref = BloomFilterRefType{ + static_cast(storage_.data), + num_blocks_, cuco::thread_scope_device, PolicyType{} }; - auto hashes = cudf::hashing::xxhash_64(values_to_hash, seed, stream, mr); + auto hashes = cudf::hashing::xxhash_64(values_to_hash, seed_, stream, mr); auto view = hashes->view(); RAPIDSMPF_EXPECTS( view.type().id() == cudf::type_to_id(), @@ -80,62 +82,70 @@ void update_filter( filter_ref.add_async(view.begin(), view.end(), stream); } -rmm::device_uvector apply_filter( - aligned_buffer& storage, - std::size_t num_blocks, - cudf::table_view const& values_to_hash, - std::uint64_t seed, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr -) { +void BloomFilter::merge(BloomFilter const& other, rmm::cuda_stream_view stream) { RAPIDSMPF_NVTX_FUNC_RANGE(); - auto policy = PolicyType{}; - auto filter_ref = BloomFilterRef{ - static_cast(storage.data), - num_blocks, - cuco::thread_scope_device, - policy - }; - auto hashes = cudf::hashing::xxhash_64(values_to_hash, seed, stream, mr); - auto view = hashes->view(); - rmm::device_uvector result(static_cast(view.size()), stream, mr); - filter_ref.contains_async( - view.begin(), view.end(), result.begin(), stream + RAPIDSMPF_EXPECTS( + num_blocks_ == other.num_blocks_, "Mismatching number of blocks in filters" ); - return result; -} - -void merge_filters( - aligned_buffer& storage, - const aligned_buffer& other, - std::size_t num_blocks, - rmm::cuda_stream_view stream -) { - auto ref_out = BloomFilterRef{ - static_cast(storage.data), - num_blocks, + auto ref_this = BloomFilterRefType{ + static_cast(storage_.data), + num_blocks_, cuco::thread_scope_device, 
PolicyType{} }; - auto ref_in = BloomFilterRef{ - static_cast(other.data), - num_blocks, + auto ref_other = BloomFilterRefType{ + static_cast(other.storage_.data), + num_blocks_, cuco::thread_scope_device, PolicyType{} }; - using word_type = BloomFilterRef::word_type; + using word_type = BloomFilterRefType::word_type; RAPIDSMPF_CUDA_TRY( cub::DeviceTransform::Transform( - cuda::std::tuple{ref_out.data(), ref_in.data()}, - ref_out.data(), - num_blocks * BloomFilterRef::words_per_block, + cuda::std::tuple{ref_this.data(), ref_other.data()}, + ref_this.data(), + num_blocks_ * BloomFilterRefType::words_per_block, [] __device__(word_type left, word_type right) { return left | right; }, stream.value() ) ); } -std::size_t num_filter_blocks(int l2cachesize) { - return (static_cast(l2cachesize) * 2) / (3 * sizeof(StorageType)); +rmm::device_uvector BloomFilter::contains( + cudf::table_view const& values, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr +) { + RAPIDSMPF_NVTX_FUNC_RANGE(); + auto filter_ref = BloomFilterRefType{ + static_cast(storage_.data), + num_blocks_, + cuco::thread_scope_device, + PolicyType{} + }; + auto hashes = cudf::hashing::xxhash_64(values, seed_, stream, mr); + auto view = hashes->view(); + rmm::device_uvector result{static_cast(view.size()), stream, mr}; + filter_ref.contains_async( + view.begin(), view.end(), result.begin(), stream + ); + return result; +} + +std::size_t BloomFilter::fitting_num_blocks(std::size_t l2size) { + return (l2size * 2) / (3 * sizeof(StorageType)); +} + +rmm::cuda_stream_view BloomFilter::stream() const noexcept { + return storage_.stream; } + +void* BloomFilter::data() const noexcept { + return storage_.data; +} + +std::size_t BloomFilter::size() const noexcept { + return storage_.size; +} + } // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp index ba5bf13df..9b59218c7 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.hpp @@ -17,7 +17,7 @@ namespace rapidsmpf::ndsh { /** * @brief A type-erased buffer with an allocation with specified alignment. */ -struct aligned_buffer { +struct AlignedBuffer { /** * @brief Construct the buffer. * @@ -26,7 +26,7 @@ struct aligned_buffer { * @param stream Stream for allocations. * @param mr Memory resource for allocations. */ - explicit aligned_buffer( + explicit AlignedBuffer( std::size_t size, std::size_t alignment, rmm::cuda_stream_view stream, @@ -41,21 +41,21 @@ struct aligned_buffer { /** * @brief Deallocate the buffer. 
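 *
 * Deallocation is stream-ordered through `mr.deallocate(stream, ...)`, so the
 * buffer must not be destroyed while work submitted on other streams may
 * still touch it; callers order such work with events first.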
*/ - ~aligned_buffer() { + ~AlignedBuffer() { mr.deallocate(stream, data, size, alignment); } - aligned_buffer(aligned_buffer const&) = delete; - aligned_buffer& operator=(aligned_buffer const&) = delete; + AlignedBuffer(AlignedBuffer const&) = delete; + AlignedBuffer& operator=(AlignedBuffer const&) = delete; - aligned_buffer(aligned_buffer&& other) + AlignedBuffer(AlignedBuffer&& other) : size{other.size}, alignment{other.alignment}, stream{other.stream}, mr{other.mr}, data{std::exchange(other.data, nullptr)} {} - aligned_buffer& operator=(aligned_buffer&& other) { + AlignedBuffer& operator=(AlignedBuffer&& other) { if (this != &other) { RAPIDSMPF_EXPECTS( !data, @@ -71,69 +71,95 @@ struct aligned_buffer { return *this; } - std::size_t size; - std::size_t alignment; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - void* data; + std::size_t size; ///< Size in bytes + std::size_t alignment; ///< Alignment in bytes + rmm::cuda_stream_view stream; ///< Stream we were allocated on + rmm::device_async_resource_ref mr; ///< Memory resource for deallocation + void* data; ///< Data }; /** - * @brief Create device storage for the bloom filter. - * - * @param num_blocks Number of blocks. - * @param stream CUDA stream for device launches and allocations. - * @param mr Memory resource for allocations. + * @brief A bloom filter, used for approximate set membership queries. */ -aligned_buffer create_filter_storage( - std::size_t num_blocks, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr -); +struct BloomFilter { + /** + * @brief Create a filter. + * + * @param num_blocks Number of blocks in the filter. + * @param seed Seed used for hashing each value. + * @param stream CUDA stream for allocations and device operations. + * @param mr Memory resource for allocations. + */ + BloomFilter( + std::size_t num_blocks, + std::uint64_t seed, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr + ); + + /** + * @brief Add values to the filter. + * + * @param values_to_hash table of values to hash (with cudf::hashing::xxhash_64()) + * @param stream CUDA stream for allocations and device operations. + * @param mr Memory resource for allocations. + */ + void add( + cudf::table_view const& values_to_hash, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr + ); + + /** + * @brief Merge two filters, computing their union. + * + * @param other Other filter to merge into this one. + * @param stream CUDA stream for device operations. + * + * @throws std::logic_error If `other` is not compatible with this filter. + */ + void merge(BloomFilter const& other, rmm::cuda_stream_view stream); + + /** + * @brief Return a mask of which rows are contained in the filter. + * + * @param values Value to check for set membership + * @param stream CUDA stream for allocations and device operations. + * @param mr Memory resource for allocations. + */ + [[nodiscard]] rmm::device_uvector contains( + cudf::table_view const& values, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr + ); + + /** + * @brief @return The stream the underlying storage is valid on. + */ + [[nodiscard]] rmm::cuda_stream_view stream() const noexcept; + + /** + * @brief @return Pointer to the underlying storage. + */ + [[nodiscard]] void* data() const noexcept; + + /** + * @brief @return Size in bytes of the underlying storage. 
+ */ + [[nodiscard]] std::size_t size() const noexcept; + + /** + * @brief @return Number of blocks to use if the filter should fit in a given L2 cache + * size. + * + * @param l2size Size of the L2 cache in bytes. + */ + [[nodiscard]] static std::size_t fitting_num_blocks(std::size_t l2size); + + private: + std::size_t num_blocks_; ///< Number of blocks used in the filter. + std::uint64_t seed_; ///< Seed used when hashing values. + AlignedBuffer storage_; ///< Backing storage. +}; -/** - * @brief Update the filter with fingerprints from a table. - * - * @param storage Allocated device storage for the bloom filter - * @param num_blocks Number of blocks. - * @param values_to_hash Table of values to hash. - * @param seed Hash seed - * @param stream CUDA stream for device launches and allocations. - * @param mr Memory resource for allocations. - */ -void update_filter( - aligned_buffer& storage, - std::size_t num_blocks, - cudf::table_view const& values_to_hash, - std::uint64_t seed, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr -); - -void merge_filters( - aligned_buffer& storage, - aligned_buffer const& other, - std::size_t num_blocks, - rmm::cuda_stream_view stream -); -/** - * @brief Apply the filter to fingerprints from a table. - * - * @param storage Allocated device storage for the bloom filter - * @param num_blocks Number of blocks. - * @param values_to_hash Table of values to hash. - * @param seed Hash seed - * @param stream CUDA stream for device launches and allocations. - * @param mr Memory resource for allocations. - * - * @return Mask vector select rows in the table that were selected by the filter. - */ -rmm::device_uvector apply_filter( - aligned_buffer& storage, - std::size_t num_blocks, - cudf::table_view const& values_to_hash, - std::uint64_t seed, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr -); } // namespace rapidsmpf::ndsh diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index 768c642f3..5ece5ab29 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -490,7 +490,9 @@ int main(int argc, char** argv) { int device; RAPIDSMPF_CUDA_TRY(cudaGetDevice(&device)); RAPIDSMPF_CUDA_TRY(cudaDeviceGetAttribute(&l2size, cudaDevAttrL2CacheSize, device)); - auto const num_filter_blocks = rapidsmpf::ndsh::num_filter_blocks(l2size); + auto const num_filter_blocks = rapidsmpf::ndsh::BloomFilter::fitting_num_blocks( + static_cast(l2size) + ); for (int i = 0; i < arguments.num_iterations; i++) { int op_id{0}; std::vector nodes; @@ -557,19 +559,12 @@ int main(int argc, char** argv) { auto lineitem_output = ctx->create_channel(); nodes.push_back( rapidsmpf::ndsh::apply_bloom_filter( - ctx, - bloom_filter_output, - lineitem, - lineitem_output, - {0}, - cudf::DEFAULT_HASH_SEED, - num_filter_blocks + ctx, bloom_filter_output, lineitem, lineitem_output, {0} ) ); // join o_orderkey = l_orderkey // Out: o_orderkey, o_orderdate, o_shippriority, l_extendedprice, // l_discount - // TODO: shuffle join option. 
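// For reference, the BloomFilter lifecycle driving the nodes above, as a
// minimal sketch (table and filter names here are placeholders):
//
//   rapidsmpf::ndsh::BloomFilter filter{num_filter_blocks, seed, stream, mr};
//   filter.add(build_keys, stream, mr);      // hash and insert fingerprints
//   filter.merge(remote_filter, stream);     // word-wise OR union
//   auto mask = filter.contains(probe_keys, stream, mr);  // one bool per row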
if (arguments.use_shuffle_join) { auto lineitem_shuffled = ctx->create_channel(); auto customer_x_orders_shuffled = ctx->create_channel(); From aa6d9cc876655c40c2b163241f30f4ee76c23841 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 11 Dec 2025 20:09:44 +0000 Subject: [PATCH 31/75] Thread safety in parquet write --- cpp/benchmarks/streaming/ndsh/parquet_writer.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/parquet_writer.cpp b/cpp/benchmarks/streaming/ndsh/parquet_writer.cpp index f5a209ce0..639823645 100644 --- a/cpp/benchmarks/streaming/ndsh/parquet_writer.cpp +++ b/cpp/benchmarks/streaming/ndsh/parquet_writer.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -34,6 +35,8 @@ rapidsmpf::streaming::Node write_parquet( auto chunk = to_device(ctx, msg.release()); auto table = chunk.table_view(); auto metadata = cudf::io::table_input_metadata(table); + CudaEvent event; + auto write_stream = chunk.stream(); RAPIDSMPF_EXPECTS( column_names.size() == metadata.column_metadata.size(), "Mismatching number of column names and chunk columns" @@ -43,7 +46,7 @@ rapidsmpf::streaming::Node write_parquet( } builder = builder.metadata(metadata); auto options = builder.build(); - auto writer = cudf::io::chunked_parquet_writer(options); + auto writer = cudf::io::chunked_parquet_writer(options, write_stream); writer.write(table); while (true) { msg = co_await ch_in->receive(); @@ -56,7 +59,9 @@ rapidsmpf::streaming::Node write_parquet( static_cast(table.num_columns()) == column_names.size(), "Mismatching number of column names and chunk columns" ); + cuda_stream_join(write_stream, chunk.stream(), &event); writer.write(table); + cuda_stream_join(chunk.stream(), write_stream, &event); } writer.close(); } From 0e7f3fdfd634ea4277b445fc71f1ccaf0a79086a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 11 Dec 2025 20:10:55 +0000 Subject: [PATCH 32/75] Fixes --- cpp/benchmarks/streaming/ndsh/q01.cpp | 73 +++-- cpp/benchmarks/streaming/ndsh/q03.cpp | 71 +++-- cpp/benchmarks/streaming/ndsh/q09.cpp | 426 ++++++++------------------ 3 files changed, 212 insertions(+), 358 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index b6f7fffad..ac2b63678 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #include #include +#include "concatenate.hpp" #include "groupby.hpp" #include "join.hpp" #include "parquet_writer.hpp" @@ -62,6 +64,7 @@ rapidsmpf::streaming::Node read_lineitem( "l_tax" // 5 }) .build(); + using timestamp_type = cudf::timestamp_D; auto filter_expr = [&]() -> std::unique_ptr { auto stream = ctx->br()->stream_pool().get_stream(); auto owner = new std::vector; @@ -72,14 +75,13 @@ rapidsmpf::streaming::Node read_lineitem( ); auto sys_days = cuda::std::chrono::sys_days(date); owner->push_back( - std::make_shared>( + std::make_shared>( sys_days.time_since_epoch(), true, stream ) ); owner->push_back( std::make_shared( - *std::any_cast< - std::shared_ptr>>( + *std::any_cast>>( owner->at(0) ) ) @@ -314,6 +316,8 @@ static __device__ void calculate_charge(double *charge, double discprice, double */ int main(int argc, char** argv) { cudaFree(nullptr); + // work around https://github.com/rapidsai/cudf/issues/20849 + cudf::initialize(); auto mr = rmm::mr::cuda_async_memory_resource{}; auto stats_wrapper = 
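// Adaptor recording allocation statistics on top of the async pool, so that
// memory usage can be included in the per-iteration statistics report below.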
rapidsmpf::RmmResourceAdaptor(&mr); auto arguments = rapidsmpf::ndsh::parse_arguments(argc, argv); @@ -339,45 +343,55 @@ int main(int argc, char** argv) { arguments.input_directory )); - auto groupby_input = ctx->create_channel(); + auto chunkwise_groupby_input = ctx->create_channel(); // Out: l_returnflag, l_linestatus, l_quantity, l_extendedprice, // disc_price = (l_extendedprice * (1 - l_discount)), // charge = (l_extendedprice * (1 - l_discount) * (1 + l_tax)) // l_discount - nodes.push_back(select_columns_for_groupby(ctx, lineitem, groupby_input)); - auto chunkwise_groupby = ctx->create_channel(); nodes.push_back( - rapidsmpf::ndsh::chunkwise_group_by( - ctx, - groupby_input, - chunkwise_groupby, - {0, 1}, - chunkwise_groupby_requests(), - cudf::null_policy::INCLUDE - ) - ); - auto final_groupby_input = ctx->create_channel(); - nodes.push_back( - rapidsmpf::ndsh::broadcast( - ctx, - chunkwise_groupby, - final_groupby_input, - static_cast(10 * i + op_id++), - rapidsmpf::streaming::AllGather::Ordered::NO - ) + select_columns_for_groupby(ctx, lineitem, chunkwise_groupby_input) ); - auto final_groupby_output = ctx->create_channel(); + auto chunkwise_groupby_output = ctx->create_channel(); nodes.push_back( rapidsmpf::ndsh::chunkwise_group_by( ctx, - final_groupby_input, - final_groupby_output, + chunkwise_groupby_input, + chunkwise_groupby_output, {0, 1}, - final_groupby_requests(), + chunkwise_groupby_requests(), cudf::null_policy::INCLUDE ) ); + auto final_groupby_input = ctx->create_channel(); + if (ctx->comm()->nranks() > 1) { + nodes.push_back( + rapidsmpf::ndsh::broadcast( + ctx, + chunkwise_groupby_output, + final_groupby_input, + static_cast(10 * i + op_id++), + rapidsmpf::streaming::AllGather::Ordered::NO + ) + ); + } else { + nodes.push_back( + rapidsmpf::ndsh::concatenate( + ctx, chunkwise_groupby_output, final_groupby_input + ) + ); + } if (ctx->comm()->rank() == 0) { + auto final_groupby_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + final_groupby_input, + final_groupby_output, + {0, 1}, + final_groupby_requests(), + cudf::null_policy::INCLUDE + ) + ); auto sorted_input = ctx->create_channel(); nodes.push_back( postprocess_group_by(ctx, final_groupby_output, sorted_input) @@ -413,7 +427,7 @@ int main(int argc, char** argv) { ) ); } else { - nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_groupby_output)); + nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_groupby_input)); } } auto end = std::chrono::steady_clock::now(); @@ -429,6 +443,7 @@ int main(int argc, char** argv) { timings.push_back(compute.count()); ctx->comm()->logger().print(ctx->statistics()->report()); } + if (ctx->comm()->rank() == 0) { for (int i = 0; i < arguments.num_iterations; i++) { ctx->comm()->logger().print( diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index 5ece5ab29..fa9796d6a 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -47,6 +48,7 @@ #include #include "bloom_filter.hpp" +#include "concatenate.hpp" #include "groupby.hpp" #include "join.hpp" #include "parquet_writer.hpp" @@ -118,7 +120,7 @@ rapidsmpf::streaming::Node read_lineitem( "l_discount", // 2 }) .build(); - using timestamp_type = cudf::timestamp_ms; + using timestamp_type = cudf::timestamp_D; auto filter_expr = [&]() -> std::unique_ptr { auto stream = ctx->br()->stream_pool().get_stream(); auto owner = new 
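// Heap allocation is deliberate: this vector keeps the AST literals and
// scalars alive past this builder, and is freed by the type-erased deleter
// attached to the returned filter expression.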
std::vector; @@ -183,7 +185,7 @@ rapidsmpf::streaming::Node read_orders( "o_custkey" // 3 }) .build(); - using timestamp_type = cudf::timestamp_ms; + using timestamp_type = cudf::timestamp_D; auto filter_expr = [&]() -> std::unique_ptr { auto stream = ctx->br()->stream_pool().get_stream(); auto owner = new std::vector; @@ -369,8 +371,11 @@ rapidsmpf::streaming::Node top_k_by( return t->view(); }); auto merged = cudf::merge(views, keys, order, {}, out_stream, ctx->br()->device_mr()); - auto result = - std::make_unique(cudf::slice(merged->view(), {0, 10}, out_stream)); + auto result = std::make_unique( + cudf::slice(merged->view(), {0, 10}, out_stream), + out_stream, + ctx->br()->device_mr() + ); co_await ch_out->send( rapidsmpf::streaming::to_message( 0, @@ -480,6 +485,8 @@ rapidsmpf::streaming::Node fanout_bounded( int main(int argc, char** argv) { rapidsmpf::ndsh::FinalizeMPI finalize{}; cudaFree(nullptr); + // work around https://github.com/rapidsai/cudf/issues/20849 + cudf::initialize(); auto mr = rmm::mr::cuda_async_memory_resource{}; auto stats_wrapper = rapidsmpf::RmmResourceAdaptor(&mr); auto arguments = rapidsmpf::ndsh::parse_arguments(argc, argv); @@ -632,30 +639,38 @@ int main(int argc, char** argv) { ) ); auto final_groupby_input = ctx->create_channel(); - nodes.push_back( - rapidsmpf::ndsh::broadcast( - ctx, - chunkwise_groupby_output, - final_groupby_input, - static_cast(10 * i + op_id++), - rapidsmpf::streaming::AllGather::Ordered::NO - ) - ); - auto final_groupby_output = ctx->create_channel(); - // Out: o_orderkey, o_orderdate, o_shippriority, revenue - nodes.push_back( - rapidsmpf::ndsh::chunkwise_group_by( - ctx, - final_groupby_input, - final_groupby_output, - {0, 1, 2}, - chunkwise_groupby_requests(), - cudf::null_policy::INCLUDE - - ) - ); - auto topk = ctx->create_channel(); + if (ctx->comm()->nranks() > 1) { + nodes.push_back( + rapidsmpf::ndsh::broadcast( + ctx, + chunkwise_groupby_output, + final_groupby_input, + static_cast(10 * i + op_id++), + rapidsmpf::streaming::AllGather::Ordered::NO + ) + ); + } else { + nodes.push_back( + rapidsmpf::ndsh::concatenate( + ctx, chunkwise_groupby_output, final_groupby_input + ) + ); + } if (ctx->comm()->rank() == 0) { + auto final_groupby_output = ctx->create_channel(); + // Out: o_orderkey, o_orderdate, o_shippriority, revenue + nodes.push_back( + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + final_groupby_input, + final_groupby_output, + {0, 1, 2}, + chunkwise_groupby_requests(), + cudf::null_policy::INCLUDE + + ) + ); + auto topk = ctx->create_channel(); // Out: o_orderkey, revenue, o_orderdate, o_shippriority nodes.push_back(top_k_by( ctx, @@ -675,7 +690,7 @@ int main(int argc, char** argv) { ) ); } else { - nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_groupby_output)); + nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_groupby_input)); } } auto end = std::chrono::steady_clock::now(); diff --git a/cpp/benchmarks/streaming/ndsh/q09.cpp b/cpp/benchmarks/streaming/ndsh/q09.cpp index 2653143c6..8c02ab24a 100644 --- a/cpp/benchmarks/streaming/ndsh/q09.cpp +++ b/cpp/benchmarks/streaming/ndsh/q09.cpp @@ -3,10 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include #include #include -#include #include #include @@ -14,14 +12,13 @@ #include #include +#include #include -#include #include #include #include #include #include -#include #include #include #include @@ -32,8 +29,6 @@ #include #include -#include -#include #include #include #include @@ -43,22 +38,14 @@ #include #include "concatenate.hpp" +#include 
"groupby.hpp" #include "join.hpp" +#include "parquet_writer.hpp" +#include "sort.hpp" #include "utils.hpp" namespace { -std::string get_table_path( - std::string const& input_directory, std::string const& table_name -) { - auto dir = input_directory.empty() ? "." : input_directory; - auto file_path = dir + "/" + table_name + ".parquet"; - if (std::filesystem::exists(file_path)) { - return file_path; - } - return dir + "/" + table_name + "/"; -} - rapidsmpf::streaming::Node read_lineitem( std::shared_ptr ctx, std::shared_ptr ch_out, @@ -67,7 +54,7 @@ rapidsmpf::streaming::Node read_lineitem( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "lineitem") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "lineitem") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns( @@ -92,7 +79,7 @@ rapidsmpf::streaming::Node read_nation( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "nation") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "nation") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"n_name", "n_nationkey"}) @@ -110,7 +97,7 @@ rapidsmpf::streaming::Node read_orders( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "orders") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "orders") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"o_orderdate", "o_orderkey"}) @@ -128,7 +115,7 @@ rapidsmpf::streaming::Node read_part( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "part") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "part") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"p_partkey", "p_name"}) @@ -146,7 +133,7 @@ rapidsmpf::streaming::Node read_partsupp( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "partsupp") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "partsupp") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"ps_partkey", "ps_suppkey", "ps_supplycost"}) @@ -164,7 +151,7 @@ rapidsmpf::streaming::Node read_supplier( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "supplier") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "supplier") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"s_nationkey", "s_suppkey"}) @@ -285,161 +272,15 @@ static __device__ void calculate_amount(double *amount, double discount, double co_await ch_out->drain(ctx->executor()); } -rapidsmpf::streaming::Node chunkwise_groupby_agg( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - std::uint64_t sequence = 0; - while (true) { - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - break; - } - co_await ctx->executor()->schedule(); - auto chunk = rapidsmpf::ndsh::to_device( - ctx, msg.release() - ); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); 
- auto grouper = cudf::groupby::groupby( - table.select({0, 1}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. - std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - co_await ch_out->send( - rapidsmpf::streaming::to_message( - sequence++, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); - } - co_await ch_out->drain(ctx->executor()); -} - -rapidsmpf::streaming::Node final_groupby_agg( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out, - rapidsmpf::OpID tag -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - co_await ctx->executor()->schedule(); - // TODO: requires concatenated input stream. - auto msg = co_await ch_in->receive(); - auto next = co_await ch_in->receive(); - ctx->comm()->logger().print("Final groupby"); - RAPIDSMPF_EXPECTS(next.empty(), "Expecting concatenated input at this point"); - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); - std::unique_ptr local_result{nullptr}; - if (!table.is_empty()) { - auto grouper = cudf::groupby::groupby( - table.select({0, 1}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request(table.column(2), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. - std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - local_result = std::make_unique(std::move(result)); - } - if (ctx->comm()->nranks() > 1) { - // Reduce across ranks... - // Need a reduce primitive in rapidsmpf, but let's just use an allgather and - // discard for now. - rapidsmpf::streaming::AllGather gatherer{ctx, tag}; - if (local_result) { - auto pack = - cudf::pack(local_result->view(), chunk_stream, ctx->br()->device_mr()); - gatherer.insert( - 0, - {rapidsmpf::PackedData( - std::move(pack.metadata), - ctx->br()->move(std::move(pack.gpu_data), chunk_stream) - )} - ); - } - gatherer.insert_finished(); - auto packed_data = - co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); - if (ctx->comm()->rank() == 0) { - auto global_result = rapidsmpf::unpack_and_concat( - rapidsmpf::unspill_partitions( - std::move(packed_data), ctx->br(), true, ctx->statistics() - ), - chunk_stream, - ctx->br(), - ctx->statistics() - ); - - // We will only actually bother to do this on rank zero. 
- auto result_view = global_result->view(); - auto grouper = cudf::groupby::groupby( - result_view.select({0, 1}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request(result_view.column(2), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - global_result.reset(); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); - } - } else { - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::move(local_result), chunk_stream - ) - ) - ); - } - co_await ch_out->drain(ctx->executor()); +std::vector chunkwise_groupby_requests() { + auto requests = std::vector(); + std::vector()>> aggs; + aggs.emplace_back(cudf::make_sum_aggregation); + requests.emplace_back(2, std::move(aggs)); + return requests; } -rapidsmpf::streaming::Node sort_by( +rapidsmpf::streaming::Node round_sum_profit( std::shared_ptr ctx, std::shared_ptr ch_in, std::shared_ptr ch_out @@ -447,11 +288,9 @@ rapidsmpf::streaming::Node sort_by( rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; co_await ctx->executor()->schedule(); auto msg = co_await ch_in->receive(); - // We know we only have a single chunk from the groupby - if (msg.empty()) { - co_return; - } - ctx->comm()->logger().print("Sortby"); + RAPIDSMPF_EXPECTS(!msg.empty(), "Expecting to see a single chunk"); + auto next = co_await ch_in->receive(); + RAPIDSMPF_EXPECTS(next.empty(), "Not expecting to see a second chunk"); auto chunk = rapidsmpf::ndsh::to_device(ctx, msg.release()); auto table = chunk.table_view(); @@ -465,11 +304,8 @@ rapidsmpf::streaming::Node sort_by( auto result = rapidsmpf::streaming::to_message( 0, std::make_unique( - cudf::sort_by_key( + std::make_unique( cudf::table_view({table.column(0), table.column(1), rounded->view()}), - table.select({0, 1}), - {cudf::order::ASCENDING, cudf::order::DESCENDING}, - {cudf::null_order::BEFORE, cudf::null_order::BEFORE}, chunk.stream(), ctx->br()->device_mr() ), @@ -480,39 +316,6 @@ rapidsmpf::streaming::Node sort_by( co_await ch_out->drain(ctx->executor()); } -rapidsmpf::streaming::Node write_parquet( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::string output_path -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in}; - co_await ctx->executor()->schedule(); - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - co_return; - } - ctx->comm()->logger().print("write parquet"); - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto sink = cudf::io::sink_info(output_path); - auto builder = cudf::io::parquet_writer_options::builder(sink, chunk.table_view()); - auto metadata = cudf::io::table_input_metadata(chunk.table_view()); - metadata.column_metadata[0].set_name("nation"); - metadata.column_metadata[1].set_name("o_year"); - metadata.column_metadata[2].set_name("sum_profit"); - builder = builder.metadata(metadata); - auto options = builder.build(); - cudf::io::write_parquet(options, chunk.stream()); - ctx->comm()->logger().print( - "Wrote chunk with ", - chunk.table_view().num_rows(), - " rows and ", - chunk.table_view().num_columns(), - " columns to ", - output_path - ); -} - } // namespace /** @@ -556,70 
+359,46 @@ rapidsmpf::streaming::Node write_parquet( */ int main(int argc, char** argv) { cudaFree(nullptr); - rapidsmpf::mpi::init(&argc, &argv); - MPI_Comm mpi_comm; - RAPIDSMPF_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); - auto cmd_options = parse_options(argc, argv); - auto limit_size = rmm::percent_of_free_device_memory( - static_cast(cmd_options.spill_device_limit.value_or(1) * 100) - ); - rmm::mr::cuda_async_memory_resource mr{}; - auto stats_mr = rapidsmpf::RmmResourceAdaptor(&mr); - rmm::device_async_resource_ref mr_ref(stats_mr); - rmm::mr::set_current_device_resource(&stats_mr); - rmm::mr::set_current_device_resource_ref(mr_ref); - std::unordered_map - memory_available{}; - if (cmd_options.spill_device_limit.has_value()) { - memory_available[rapidsmpf::MemoryType::DEVICE] = rapidsmpf::LimitAvailableMemory{ - &stats_mr, static_cast(limit_size) - }; - } - auto br = std::make_shared( - stats_mr, - rapidsmpf::BufferResource::PinnedMemoryResourceDisabled, - std::move(memory_available) - ); - auto envvars = rapidsmpf::config::get_environment_variables(); - envvars["num_streaming_threads"] = std::to_string(cmd_options.num_streaming_threads); - auto options = rapidsmpf::config::Options(envvars); - auto stats = std::make_shared(&stats_mr); - { - auto comm = rapidsmpf::ucxx::init_using_mpi(mpi_comm, options); - auto progress = - std::make_shared(comm->logger(), stats); - auto ctx = - std::make_shared(options, comm, br, stats); - comm->logger().print( - "Executor has ", ctx->executor()->thread_count(), " threads" - ); - comm->logger().print("Executor has ", ctx->comm()->nranks(), " ranks"); - - std::string output_path = cmd_options.output_file; - std::vector timings; - for (int i = 0; i < 2; i++) { - rapidsmpf::OpID op_id{0}; - std::vector nodes; - auto start = std::chrono::steady_clock::now(); - { - RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q9 pipeline"); - auto part = ctx->create_channel(); - auto filtered_part = ctx->create_channel(); - auto partsupp = ctx->create_channel(); - auto part_x_partsupp = ctx->create_channel(); - auto supplier = ctx->create_channel(); - auto lineitem = ctx->create_channel(); - auto supplier_x_part_x_partsupp = ctx->create_channel(); - auto supplier_x_part_x_partsupp_x_lineitem = ctx->create_channel(); - nodes.push_back(read_part( - ctx, - part, - /* num_tickets */ 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); // p_partkey, p_name - nodes.push_back(filter_part(ctx, part, filtered_part)); // p_partkey - nodes.push_back(read_partsupp( + // work around https://github.com/rapidsai/cudf/issues/20849 + cudf::initialize(); + auto mr = rmm::mr::cuda_async_memory_resource{}; + auto stats_wrapper = rapidsmpf::RmmResourceAdaptor(&mr); + auto arguments = rapidsmpf::ndsh::parse_arguments(argc, argv); + auto ctx = rapidsmpf::ndsh::create_context(arguments, &stats_wrapper); + std::string output_path = arguments.output_file; + std::vector timings; + for (int i = 0; i < arguments.num_iterations; i++) { + rapidsmpf::OpID op_id{0}; + std::vector nodes; + auto start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q9 pipeline"); + auto part = ctx->create_channel(); + auto filtered_part = ctx->create_channel(); + auto partsupp = ctx->create_channel(); + auto part_x_partsupp = ctx->create_channel(); + auto supplier = ctx->create_channel(); + auto lineitem = ctx->create_channel(); + auto supplier_x_part_x_partsupp = ctx->create_channel(); + auto supplier_x_part_x_partsupp_x_lineitem = ctx->create_channel(); + 
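// Every node pushed below is a coroutine; the channels wire them into a
// dataflow graph, and no work runs until the assembled pipeline is executed.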
nodes.push_back(read_part( + ctx, + part, + /* num_tickets */ 4, + arguments.num_rows_per_chunk, + arguments.input_directory + )); // p_partkey, p_name + nodes.push_back(filter_part(ctx, part, filtered_part)); // p_partkey + nodes.push_back(read_partsupp( + ctx, + partsupp, + /* num_tickets */ 4, + arguments.num_rows_per_chunk, + arguments.input_directory + )); // ps_partkey, ps_suppkey, ps_supplycost + nodes.push_back( + // p_partkey x ps_partkey + rapidsmpf::ndsh::inner_join_broadcast( ctx, filtered_part, partsupp, @@ -763,31 +542,76 @@ int main(int argc, char** argv) { ) // n_name, ps_supplycost, l_discount, l_extendedprice, // l_quantity, o_orderdate ); - auto groupby_input = ctx->create_channel(); - nodes.push_back(select_columns(ctx, all_joined, groupby_input)); + auto chunkwise_groupby_input = ctx->create_channel(); + nodes.push_back(select_columns(ctx, all_joined, chunkwise_groupby_input)); auto chunkwise_groupby_output = ctx->create_channel(); nodes.push_back( - chunkwise_groupby_agg(ctx, groupby_input, chunkwise_groupby_output) - ); - auto concatenated_groupby_output = ctx->create_channel(); - nodes.push_back( - rapidsmpf::ndsh::concatenate( + rapidsmpf::ndsh::chunkwise_group_by( ctx, + chunkwise_groupby_input, chunkwise_groupby_output, - concatenated_groupby_output, - rapidsmpf::ndsh::ConcatOrder::DONT_CARE + {0, 1}, + chunkwise_groupby_requests(), + cudf::null_policy::INCLUDE ) ); - auto groupby_output = ctx->create_channel(); - nodes.push_back(final_groupby_agg( - ctx, - concatenated_groupby_output, - groupby_output, - rapidsmpf::OpID{static_cast(10 * i + op_id++)} - )); - auto sorted_output = ctx->create_channel(); - nodes.push_back(sort_by(ctx, groupby_output, sorted_output)); - nodes.push_back(write_parquet(ctx, sorted_output, output_path)); + auto final_groupby_input = ctx->create_channel(); + if (ctx->comm()->nranks() > 1) { + nodes.push_back( + rapidsmpf::ndsh::broadcast( + ctx, + chunkwise_groupby_output, + final_groupby_input, + static_cast(10 * i + op_id++), + rapidsmpf::streaming::AllGather::Ordered::NO + ) + ); + } else { + nodes.push_back( + rapidsmpf::ndsh::concatenate( + ctx, chunkwise_groupby_output, final_groupby_input + ) + ); + } + if (ctx->comm()->rank() == 0) { + auto final_groupby_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + final_groupby_input, + final_groupby_output, + {0, 1}, + chunkwise_groupby_requests(), + cudf::null_policy::INCLUDE + ) + ); + auto sorted_input = ctx->create_channel(); + nodes.push_back( + round_sum_profit(ctx, final_groupby_output, sorted_input) + ); + auto sorted_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_sort_by( + ctx, + sorted_input, + sorted_output, + {0, 1}, + {0, 1, 2}, + {cudf::order::ASCENDING, cudf::order::DESCENDING}, + {cudf::null_order::BEFORE, cudf::null_order::BEFORE} + ) + ); + nodes.push_back( + rapidsmpf::ndsh::write_parquet( + ctx, + sorted_output, + cudf::io::sink_info{output_path}, + {"nation", "o_year", "sum_profit"} + ) + ); + } else { + nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_groupby_input)); + } } auto end = std::chrono::steady_clock::now(); std::chrono::duration pipeline = end - start; From 7a1485482ee82ec11905317377aaf5258cb8be38 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 11 Dec 2025 20:17:00 +0000 Subject: [PATCH 33/75] Adapt to upstream changes --- cpp/benchmarks/streaming/ndsh/utils.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index 29fb04f4a..3c09ac952 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -129,10 +130,11 @@ std::shared_ptr create_context( memory_available[MemoryType::DEVICE] = LimitAvailableMemory{mr, static_cast(limit_size)}; } - auto statistics = std::make_shared(mr); + auto statistics = std::make_shared(mr); - auto br = std::make_shared( + auto br = std::make_shared( mr, + BufferResource::PinnedMemoryResourceDisabled, std::move(memory_available), arguments.periodic_spill, std::make_shared( From 1df954a8fd32b689a07bfca5ce6d8dabe6349ec5 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Dec 2025 10:45:22 +0000 Subject: [PATCH 34/75] Fix timestamp types --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 3 ++- cpp/benchmarks/streaming/ndsh/q01.cpp | 4 +++- cpp/benchmarks/streaming/ndsh/q03.cpp | 7 +++++-- cpp/benchmarks/streaming/ndsh/q09.cpp | 1 + 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index 89a72589c..dc00fc8c9 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -13,7 +13,8 @@ if(NOT RAPIDSMPF_HAVE_STREAMING) message(FATAL_ERROR "Streaming NDSH benchmarks require streaming support") endif() -add_library(rapidsmpfndsh bloom_filter.cpp bloom_filter_impl.cu concatenate.cpp groupby.cpp join.cpp parquet_writer.cpp sort.cpp utils.cpp) +add_library(rapidsmpfndsh bloom_filter.cpp bloom_filter_impl.cu concatenate.cpp groupby.cpp join.cpp + parquet_writer.cpp sort.cpp utils.cpp) set_target_properties( rapidsmpfndsh diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index ac2b63678..50fdfcd72 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -64,7 +64,8 @@ rapidsmpf::streaming::Node read_lineitem( "l_tax" // 5 }) .build(); - using timestamp_type = cudf::timestamp_D; + // TODO: utility to get logical types from parquet. + using timestamp_type = cudf::timestamp_ms; auto filter_expr = [&]() -> std::unique_ptr { auto stream = ctx->br()->stream_pool().get_stream(); auto owner = new std::vector; @@ -442,6 +443,7 @@ int main(int argc, char** argv) { timings.push_back(pipeline.count()); timings.push_back(compute.count()); ctx->comm()->logger().print(ctx->statistics()->report()); + ctx->statistics()->clear(); } if (ctx->comm()->rank() == 0) { diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index fa9796d6a..82408d585 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -120,7 +120,8 @@ rapidsmpf::streaming::Node read_lineitem( "l_discount", // 2 }) .build(); - using timestamp_type = cudf::timestamp_D; + // TODO: utility to get logical types from parquet. + using timestamp_type = cudf::timestamp_ms; auto filter_expr = [&]() -> std::unique_ptr { auto stream = ctx->br()->stream_pool().get_stream(); auto owner = new std::vector; @@ -185,7 +186,8 @@ rapidsmpf::streaming::Node read_orders( "o_custkey" // 3 }) .build(); - using timestamp_type = cudf::timestamp_D; + // TODO: utility to get logical types from parquet. 
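// The generated input files store dates as millisecond timestamps, so the
// literal fed to the AST filter must be timestamp_ms to match the column
// type; a mismatched timestamp_D literal would fail the comparison.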
+ using timestamp_type = cudf::timestamp_ms; auto filter_expr = [&]() -> std::unique_ptr { auto stream = ctx->br()->stream_pool().get_stream(); auto owner = new std::vector; @@ -705,6 +707,7 @@ int main(int argc, char** argv) { timings.push_back(pipeline.count()); timings.push_back(compute.count()); ctx->comm()->logger().print(ctx->statistics()->report()); + ctx->statistics()->clear(); } if (ctx->comm()->rank() == 0) { for (int i = 0; i < arguments.num_iterations; i++) { diff --git a/cpp/benchmarks/streaming/ndsh/q09.cpp b/cpp/benchmarks/streaming/ndsh/q09.cpp index 8c02ab24a..563d97c09 100644 --- a/cpp/benchmarks/streaming/ndsh/q09.cpp +++ b/cpp/benchmarks/streaming/ndsh/q09.cpp @@ -625,6 +625,7 @@ int main(int argc, char** argv) { timings.push_back(pipeline.count()); timings.push_back(compute.count()); ctx->comm()->logger().print(ctx->statistics()->report()); + ctx->statistics()->clear(); } if (ctx->comm()->rank() == 0) { for (int i = 0; i < arguments.num_iterations; i++) { From 48d8fd02b0c375c6121b1fd83b1c2f9e347b9f73 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Dec 2025 11:36:53 +0000 Subject: [PATCH 35/75] cmake format --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index dc00fc8c9..54b30999c 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -13,8 +13,10 @@ if(NOT RAPIDSMPF_HAVE_STREAMING) message(FATAL_ERROR "Streaming NDSH benchmarks require streaming support") endif() -add_library(rapidsmpfndsh bloom_filter.cpp bloom_filter_impl.cu concatenate.cpp groupby.cpp join.cpp - parquet_writer.cpp sort.cpp utils.cpp) +add_library( + rapidsmpfndsh bloom_filter.cpp bloom_filter_impl.cu concatenate.cpp groupby.cpp join.cpp + parquet_writer.cpp sort.cpp utils.cpp +) set_target_properties( rapidsmpfndsh From 89ef19cf38dc6e945334a1199f2ea64d76b07114 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Dec 2025 11:49:57 +0000 Subject: [PATCH 36/75] Loop in cmake --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 87 ++++++-------------- 1 file changed, 23 insertions(+), 64 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index 54b30999c..fb50a8358 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -39,57 +39,28 @@ target_link_libraries( $ maybe_asan ) -add_executable(q01 "q01.cpp") -set_target_properties( - q01 - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON -) -target_compile_options( - q01 PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" - "$<$:${RAPIDSMPF_CUDA_FLAGS}>" -) -target_link_libraries( - q01 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ - $ maybe_asan -) -add_executable(q03 "q03.cpp") -set_target_properties( - q03 - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON -) -target_compile_options( - q03 PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" - "$<$:${RAPIDSMPF_CUDA_FLAGS}>" -) -target_link_libraries( - q03 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ - $ maybe_asan -) -add_executable(q09 "q09.cpp") -set_target_properties( - q09 - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON 
-) -target_compile_options( - q09 PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" - "$<$:${RAPIDSMPF_CUDA_FLAGS}>" -) -target_link_libraries( - q09 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ - $ maybe_asan -) + +set(RAPIDSMPFNDSH_QUERIES q01 q03 q09) + +foreach(query IN ITEMS ${RAPIDSMPFNDSH_QUERIES}) + add_executable(${query} "${query}.cpp") + set_target_properties( + ${query} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options( + ${query} PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" + "$<$:${RAPIDSMPF_CUDA_FLAGS}>" + ) + target_link_libraries( + ${query} PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ + $ maybe_asan + ) +endforeach() install( TARGETS rapidsmpfndsh @@ -98,19 +69,7 @@ install( EXCLUDE_FROM_ALL ) install( - TARGETS q01 - COMPONENT benchmarking - DESTINATION bin/benchmarks/librapidsmpf - EXCLUDE_FROM_ALL -) -install( - TARGETS q03 - COMPONENT benchmarking - DESTINATION bin/benchmarks/librapidsmpf - EXCLUDE_FROM_ALL -) -install( - TARGETS q09 + TARGETS ${RAPIDSMPFNDSH_QUERIES} COMPONENT benchmarking DESTINATION bin/benchmarks/librapidsmpf EXCLUDE_FROM_ALL From 39550abaa15bbc8590db7576ff82ebab0e6552bc Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Dec 2025 11:51:37 +0000 Subject: [PATCH 37/75] event_loop range only in verbose mode --- cpp/src/allgather/allgather.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/allgather/allgather.cpp b/cpp/src/allgather/allgather.cpp index 6564ea283..3959d6815 100644 --- a/cpp/src/allgather/allgather.cpp +++ b/cpp/src/allgather/allgather.cpp @@ -404,7 +404,7 @@ AllGather::AllGather( } ProgressThread::ProgressState AllGather::event_loop() { - RAPIDSMPF_NVTX_SCOPED_RANGE("AllGather::event_loop"); + RAPIDSMPF_NVTX_SCOPED_RANGE_VERBOSE("AllGather::event_loop"); /* * Data flow: * User inserts into inserted_ From 6c96b17b6d27af82b54514c89406a0cac96abcd1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 12 Dec 2025 11:05:00 -0800 Subject: [PATCH 38/75] Avoid RAPIDSMPF_FUNC_RANGE macro in .cu file This macro relies on `__VA_ARGS__`. Apparently nvcc handles this macro differently / incorrectly, so we avoid it. Instead, we use NVTX3_FUNC_RANGE_IN(rapidsmpf_domain) directly. 
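For context, the troublesome pattern simplified (F and G are placeholder
names): with
  #define F(...) G(dummy, ##__VA_ARGS__)
the `##` is a GNU extension that deletes the trailing comma when F() is
invoked with no arguments; the portable C++20 spelling is
  #define F(...) G(dummy __VA_OPT__(, ) __VA_ARGS__)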
--- cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu index 31a0c3736..25fedfea2 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu @@ -66,7 +66,7 @@ void BloomFilter::add( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { - RAPIDSMPF_NVTX_FUNC_RANGE(); + NVTX3_FUNC_RANGE_IN(rapidsmpf_domain); auto filter_ref = BloomFilterRefType{ static_cast(storage_.data), num_blocks_, @@ -83,7 +83,7 @@ void BloomFilter::add( } void BloomFilter::merge(BloomFilter const& other, rmm::cuda_stream_view stream) { - RAPIDSMPF_NVTX_FUNC_RANGE(); + NVTX3_FUNC_RANGE_IN(rapidsmpf_domain); RAPIDSMPF_EXPECTS( num_blocks_ == other.num_blocks_, "Mismatching number of blocks in filters" ); @@ -116,7 +116,7 @@ rmm::device_uvector BloomFilter::contains( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { - RAPIDSMPF_NVTX_FUNC_RANGE(); + NVTX3_FUNC_RANGE_IN(rapidsmpf_domain); auto filter_ref = BloomFilterRefType{ static_cast(storage_.data), num_blocks_, From 74f36a842f5c0fdc25d7025293c72e0ee335ec77 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 15 Dec 2025 10:46:23 +0000 Subject: [PATCH 39/75] Remove GNUism in RAPIDSMPF_NVTX_FUNC_RANGE Rather than using the GNU-extension ##__VA_ARGS__ to swallow a comma, use __VA_OPT__. --- cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu | 6 +++--- cpp/include/rapidsmpf/nvtx.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu index 25fedfea2..31a0c3736 100644 --- a/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu +++ b/cpp/benchmarks/streaming/ndsh/bloom_filter_impl.cu @@ -66,7 +66,7 @@ void BloomFilter::add( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { - NVTX3_FUNC_RANGE_IN(rapidsmpf_domain); + RAPIDSMPF_NVTX_FUNC_RANGE(); auto filter_ref = BloomFilterRefType{ static_cast(storage_.data), num_blocks_, @@ -83,7 +83,7 @@ void BloomFilter::add( } void BloomFilter::merge(BloomFilter const& other, rmm::cuda_stream_view stream) { - NVTX3_FUNC_RANGE_IN(rapidsmpf_domain); + RAPIDSMPF_NVTX_FUNC_RANGE(); RAPIDSMPF_EXPECTS( num_blocks_ == other.num_blocks_, "Mismatching number of blocks in filters" ); @@ -116,7 +116,7 @@ rmm::device_uvector BloomFilter::contains( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr ) { - NVTX3_FUNC_RANGE_IN(rapidsmpf_domain); + RAPIDSMPF_NVTX_FUNC_RANGE(); auto filter_ref = BloomFilterRefType{ static_cast(storage_.data), num_blocks_, diff --git a/cpp/include/rapidsmpf/nvtx.hpp b/cpp/include/rapidsmpf/nvtx.hpp index 95eebb897..42a8b84fd 100644 --- a/cpp/include/rapidsmpf/nvtx.hpp +++ b/cpp/include/rapidsmpf/nvtx.hpp @@ -76,9 +76,9 @@ struct rapidsmpf_domain { #define RAPIDSMPF_GET_MACRO_FUNC(_0, _1, NAME, ...) NAME // unwrap the arguments and call the appropriate macro -#define RAPIDSMPF_NVTX_FUNC_RANGE_IMPL(...) \ - RAPIDSMPF_GET_MACRO_FUNC(dummy, ##__VA_ARGS__, RAPIDSMPF_NVTX_FUNC_RANGE_IMPL_WITH_VAL, RAPIDSMPF_NVTX_FUNC_RANGE_IMPL_WITHOUT_VAL)( \ - __VA_ARGS__ \ +#define RAPIDSMPF_NVTX_FUNC_RANGE_IMPL(...) 
\ + RAPIDSMPF_GET_MACRO_FUNC(dummy __VA_OPT__(, ) __VA_ARGS__, RAPIDSMPF_NVTX_FUNC_RANGE_IMPL_WITH_VAL, RAPIDSMPF_NVTX_FUNC_RANGE_IMPL_WITHOUT_VAL)( \ + __VA_ARGS__ \ ) /** From 4d5a6aaa3a1d1b22edb465af91099511c84ff498 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 15 Dec 2025 10:53:12 +0000 Subject: [PATCH 40/75] Finalize MPI with RAII --- cpp/benchmarks/streaming/ndsh/q01.cpp | 4 +--- cpp/benchmarks/streaming/ndsh/q09.cpp | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index 50fdfcd72..3a245f42d 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -316,6 +316,7 @@ static __device__ void calculate_charge(double *charge, double discprice, double * @endcode{} */ int main(int argc, char** argv) { + rapidsmpf::ndsh::FinalizeMPI finalize{}; cudaFree(nullptr); // work around https://github.com/rapidsai/cudf/issues/20849 cudf::initialize(); @@ -459,8 +460,5 @@ int main(int argc, char** argv) { ); } } - if (rapidsmpf::mpi::is_initialized()) { - RAPIDSMPF_MPI(MPI_Finalize()); - } return 0; } diff --git a/cpp/benchmarks/streaming/ndsh/q09.cpp b/cpp/benchmarks/streaming/ndsh/q09.cpp index 226b320f9..ea4611785 100644 --- a/cpp/benchmarks/streaming/ndsh/q09.cpp +++ b/cpp/benchmarks/streaming/ndsh/q09.cpp @@ -359,6 +359,7 @@ rapidsmpf::streaming::Node round_sum_profit( * @endcode{} */ int main(int argc, char** argv) { + rapidsmpf::ndsh::FinalizeMPI finalize{}; cudaFree(nullptr); // work around https://github.com/rapidsai/cudf/issues/20849 cudf::initialize(); @@ -641,8 +642,5 @@ int main(int argc, char** argv) { ); } } - if (rapidsmpf::mpi::is_initialized()) { - RAPIDSMPF_MPI(MPI_Finalize()); - } return 0; } From 483765becfdf1aec090cd97aa8af62730c7793a0 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 15 Dec 2025 10:59:22 +0000 Subject: [PATCH 41/75] cmake-format --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index fb50a8358..40477f5dd 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -39,7 +39,6 @@ target_link_libraries( $ maybe_asan ) - set(RAPIDSMPFNDSH_QUERIES q01 q03 q09) foreach(query IN ITEMS ${RAPIDSMPFNDSH_QUERIES}) From 09c0cc227c50da78a7206109c6bfeaef88fcc0da Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 15 Dec 2025 11:08:13 +0000 Subject: [PATCH 42/75] Fixes --- cpp/benchmarks/streaming/ndsh/groupby.hpp | 2 +- cpp/benchmarks/streaming/ndsh/q01.cpp | 2 ++ cpp/benchmarks/streaming/ndsh/q03.cpp | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/groupby.hpp b/cpp/benchmarks/streaming/ndsh/groupby.hpp index 46eb24b50..9416e427a 100644 --- a/cpp/benchmarks/streaming/ndsh/groupby.hpp +++ b/cpp/benchmarks/streaming/ndsh/groupby.hpp @@ -27,7 +27,7 @@ struct groupby_request { /** * @brief Perform a chunkwise grouped aggregation. * - * @note Grouped chunks are not futher grouped together. + * @note Grouped chunks are not further grouped together. * * @param ctx Streaming context. 
* @param ch_in `TableChunk`s to aggregate diff --git a/cpp/benchmarks/streaming/ndsh/q01.cpp b/cpp/benchmarks/streaming/ndsh/q01.cpp index 3a245f42d..d2adbf029 100644 --- a/cpp/benchmarks/streaming/ndsh/q01.cpp +++ b/cpp/benchmarks/streaming/ndsh/q01.cpp @@ -253,6 +253,7 @@ static __device__ void calculate_charge(double *charge, double discprice, double false, std::nullopt, cudf::null_aware::NO, + cudf::output_nullability::PRESERVE, chunk_stream, ctx->br()->device_mr() ) @@ -266,6 +267,7 @@ static __device__ void calculate_charge(double *charge, double discprice, double false, std::nullopt, cudf::null_aware::NO, + cudf::output_nullability::PRESERVE, chunk_stream, ctx->br()->device_mr() ) diff --git a/cpp/benchmarks/streaming/ndsh/q03.cpp b/cpp/benchmarks/streaming/ndsh/q03.cpp index 82408d585..f28eebdac 100644 --- a/cpp/benchmarks/streaming/ndsh/q03.cpp +++ b/cpp/benchmarks/streaming/ndsh/q03.cpp @@ -304,6 +304,7 @@ static __device__ void calculate_revenue(double *revenue, double extprice, doubl false, std::nullopt, cudf::null_aware::NO, + cudf::output_nullability::PRESERVE, chunk_stream, ctx->br()->device_mr() ) From b81e2c2f9a8f8d05ca93303ae70f2cd3f3bf7f56 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 15 Dec 2025 13:58:12 +0000 Subject: [PATCH 43/75] More fixes --- cpp/benchmarks/streaming/ndsh/utils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index 3c09ac952..12880f29b 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -134,7 +135,7 @@ std::shared_ptr create_context( auto br = std::make_shared( mr, - BufferResource::PinnedMemoryResourceDisabled, + PinnedMemoryResource::Disabled, std::move(memory_available), arguments.periodic_spill, std::make_shared( From c415379302d35ce7b2b33393d4057bdafed77416 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 15 Dec 2025 16:09:06 +0000 Subject: [PATCH 44/75] Add docstring in cmake Co-authored-by: Tom Augspurger --- cmake/thirdparty/get_cuco.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/thirdparty/get_cuco.cmake b/cmake/thirdparty/get_cuco.cmake index 1b0b6f7f9..54a5220f6 100644 --- a/cmake/thirdparty/get_cuco.cmake +++ b/cmake/thirdparty/get_cuco.cmake @@ -5,6 +5,7 @@ # cmake-format: on # ============================================================================= +# This function finds cuco and sets any additional necessary environment variables. 
function(find_and_configure_cucollections) include(${rapids-cmake-dir}/cpm/cuco.cmake) From cc7e5e081b5d0ec9c8760004b1ef31542c8f09ed Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 26 Nov 2025 13:00:26 -0800 Subject: [PATCH 45/75] WIP: Streaming Q4 implementation --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 38 +- cpp/benchmarks/streaming/ndsh/join.cpp | 170 +++ cpp/benchmarks/streaming/ndsh/join.hpp | 59 + cpp/benchmarks/streaming/ndsh/q04.cpp | 1033 ++++++++++++++++++ cpp/benchmarks/streaming/ndsh/q09.cpp | 23 +- cpp/benchmarks/streaming/ndsh/utils.cpp | 39 + cpp/benchmarks/streaming/ndsh/utils.hpp | 20 +- 7 files changed, 1346 insertions(+), 36 deletions(-) create mode 100644 cpp/benchmarks/streaming/ndsh/q04.cpp diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index 6fa4bd27b..05c1af5cd 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -36,23 +36,25 @@ target_link_libraries( $ maybe_asan ) -add_executable(q09 "q09.cpp") -set_target_properties( - q09 - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON -) -target_compile_options( - q09 PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" - "$<$:${RAPIDSMPF_CUDA_FLAGS}>" -) -target_link_libraries( - q09 PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ - $ maybe_asan -) +foreach(query IN ITEMS q04 q09) + add_executable(${query} "${query}.cpp") + set_target_properties( + ${query} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + target_compile_options( + ${query} PRIVATE "$<$:${RAPIDSMPF_CXX_FLAGS}>" + "$<$:${RAPIDSMPF_CUDA_FLAGS}>" + ) + target_link_libraries( + ${query} PRIVATE rapidsmpfndsh rapidsmpf::rapidsmpf $ + $ maybe_asan + ) +endforeach() install( TARGETS rapidsmpfndsh COMPONENT benchmarking @@ -60,7 +62,7 @@ install( EXCLUDE_FROM_ALL ) install( - TARGETS q09 + TARGETS q04 q09 COMPONENT benchmarking DESTINATION bin/benchmarks/librapidsmpf EXCLUDE_FROM_ALL diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 271b6e5c0..2b1a6a7e7 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -145,6 +146,73 @@ coro::task broadcast( } } +/** + * @brief Join a table chunk against a build hash table returning a message of the result. + * + * @param ctx Streaming context + * @param left_chunk Chunk to join. Used as the probe table in a filtered join. + * @param right_chunk Chunk to join. Used as the build table in a filtered join. + * @param right_on Key column indices in `left_chunk`. + * @param right_on Key column indices in `right_chunk`. + * @param sequence Sequence number of the output + * + * @return Message of `TableChunk` containing the result of the inner join. 
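+ *
+ * @note The result is a left semi join: the rows of `left_chunk` with at least
+ * one match in `right_chunk`, retaining all columns of `left_chunk`.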
+ */ +streaming::Message semi_join_chunk( + std::shared_ptr ctx, + streaming::TableChunk const& left_chunk, + streaming::TableChunk&& right_chunk, + [[maybe_unused]] std::vector left_on, + std::vector right_on, + std::uint64_t sequence +) { + CudaEvent event; + right_chunk = to_device(ctx, std::move(right_chunk)); + + auto joiner = cudf::filtered_join( + right_chunk.table_view().select(right_on), + cudf::null_equality::UNEQUAL, + cudf::set_as_build_table::RIGHT, + left_chunk.stream() + ); + + // We need data to be ready on both left and right sides of the table, + // so the `semi_join` must be on a stream that's downstream of both left and right. + CudaEvent build_event; + build_event.record(left_chunk.stream()); // build_event downstream of left + + auto chunk_stream = right_chunk.stream(); + build_event.stream_wait(chunk_stream); // build_event downstream of right + + auto match = joiner.semi_join( + left_chunk.table_view().select(left_on), chunk_stream, ctx->br()->device_mr() + ); + + ctx->comm()->logger().debug( + "semi_join_chunk: left.num_rows()=", left_chunk.table_view().num_rows() + ); + ctx->comm()->logger().debug("semi_join_chunk: match.size()=", match->size()); + + cudf::column_view indices = cudf::device_span(*match); + auto result_columns = cudf::gather( + left_chunk.table_view(), + indices, + cudf::out_of_bounds_policy::DONT_CHECK, + chunk_stream, + ctx->br()->device_mr() + ) + ->release(); + + auto result_table = std::make_unique(std::move(result_columns)); + ctx->comm()->logger().debug( + "semi_join_chunk: result_table.num_rows()=", result_table->num_rows() + ); + return streaming::to_message( + sequence, + std::make_unique(std::move(result_table), chunk_stream) + ); +} + /** * @brief Join a table chunk against a build hash table returning a message of the result. * @@ -347,6 +415,107 @@ streaming::Node inner_join_shuffle( co_await ch_out->drain(ctx->executor()); } +/** + * @brief Perform a left semi join between two tables, broadcasting the left table to all + * ranks. + * + * @param ctx Streaming context + * @param left Channel of `TableChunk`s used as the broadcasted build side. This table is + * broadcasted to all ranks. + * @param right Channel of `TableChunk`s joined in turn against the build side. This table + * is required to be shuffled / hash-partitioned. + * @param ch_out Output channel of `TableChunk`s. + * @param left_on Column indices of the keys in the left table. + * @param right_on Column indices of the keys in the right table. + * @param tag Disambiguating tag for the broadcast of the left table. + * @param keep_keys Does the result contain the key columns, or only "carrier" value + * columns + * @return Coroutine representing the completion of the join. + * + * @note This implementation assumes that: + * - `left` is small and fits in memory + * - `right` is shuffled / hash-partitioned + * It doesn't implement build table reuse across chunks of `left`, because we assume that + * `right` is too large to broadcast. 
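+ *
+ * The `keep_keys` parameter is not yet honoured (it is marked
+ * `[[maybe_unused]]`); the result always contains every column of `left`.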
+ */ +streaming::Node left_semi_join_broadcast_left( + std::shared_ptr ctx, + std::shared_ptr left, + std::shared_ptr right, + std::shared_ptr ch_out, + std::vector left_on, + std::vector right_on, + OpID tag, + [[maybe_unused]] KeepKeys keep_keys +) { + streaming::ShutdownAtExit c{left, right, ch_out}; + co_await ctx->executor()->schedule(); + ctx->comm()->logger().print("Inner broadcast join ", static_cast(tag)); + auto left_table = to_device( + ctx, (co_await broadcast(ctx, left, tag)).release() + ); + ctx->comm()->logger().print( + "Left (probe) table has ", left_table.table_view().num_rows(), " rows" + ); + + std::size_t sequence = 0; + while (true) { + auto right_msg = co_await right->receive(); + if (right_msg.empty()) { + break; + } + co_await ch_out->send(semi_join_chunk( + ctx, + left_table, + right_msg.release(), + left_on, + right_on, + sequence++ + )); + } + + co_await ch_out->drain(ctx->executor()); +} + +streaming::Node left_semi_join_shuffle( + std::shared_ptr ctx, + std::shared_ptr left, + std::shared_ptr right, + std::shared_ptr ch_out, + std::vector left_on, + std::vector right_on +) { + streaming::ShutdownAtExit c{left, right, ch_out}; + ctx->comm()->logger().print("Shuffle left semi join"); + + co_await ctx->executor()->schedule(); + + while (true) { + // Requirement: two shuffles kick out partitions in the same order + auto left_msg = co_await left->receive(); + auto right_msg = co_await right->receive(); + if (left_msg.empty()) { + RAPIDSMPF_EXPECTS( + right_msg.empty(), "Left does not have same number of partitions as right" + ); + break; + } + RAPIDSMPF_EXPECTS( + left_msg.sequence_number() == right_msg.sequence_number(), + "Mismatching sequence numbers" + ); + + co_await ch_out->send(semi_join_chunk( + ctx, + left_msg.release(), + right_msg.release(), + left_on, + right_on, + left_msg.sequence_number() + )); + } +} + streaming::Node shuffle( std::shared_ptr ctx, std::shared_ptr ch_in, @@ -362,6 +531,7 @@ streaming::Node shuffle( while (true) { auto msg = co_await ch_in->receive(); if (msg.empty()) { + ctx->comm()->logger().print("Shuffle: no more input"); break; } auto chunk = to_device(ctx, msg.release()); diff --git a/cpp/benchmarks/streaming/ndsh/join.hpp b/cpp/benchmarks/streaming/ndsh/join.hpp index ddd799112..fd221faef 100644 --- a/cpp/benchmarks/streaming/ndsh/join.hpp +++ b/cpp/benchmarks/streaming/ndsh/join.hpp @@ -77,6 +77,65 @@ streaming::Node inner_join_shuffle( KeepKeys keep_keys = KeepKeys::YES ); +/** + * @brief Perform a streaming left semi join between two tables. + * + * @note This performs a broadcast join, broadcasting the table represented by the `left` + * channel to all ranks, and then streaming through the chunks of the `right` channel. + * The `right` channel is required to provide hash-partitioned data in-order. + * + * @param ctx Streaming context. + * @param left Channel of `TableChunk`s in hash-partitioned order. + * @param right Channel of `TableChunk`s in matching hash-partitioned order. + * @param ch_out Output channel of `TableChunk`s. + * @param left_on Column indices of the keys in the left table. + * @param right_on Column indices of the keys in the right table. + * @param tag Disambiguating tag for the broadcast of the left table. + * @param keep_keys Does the result contain the key columns, or only "carrier" value + * columns + * + * @return Coroutine representing the completion of the join. 
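+ *
+ * @note Since `left` is broadcast in full, only `right` strictly needs to be
+ * hash-partitioned.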
+ */ +streaming::Node left_semi_join_broadcast_left( + std::shared_ptr ctx, + // We will always choose left as build table and do "broadcast" joins + std::shared_ptr left, + std::shared_ptr right, + std::shared_ptr ch_out, + std::vector left_on, + std::vector right_on, + OpID tag, + KeepKeys keep_keys +); + +/** + * @brief Perform a streaming left semi join between two tables. + * + * @note This performs a shuffle join, the left and right channels are required to provide + * hash-partitioned data in-order. + * + * @param ctx Streaming context. + * @param left Channel of `TableChunk`s in hash-partitioned order. + * @param right Channel of `TableChunk`s in matching hash-partitioned order. + * @param ch_out Output channel of `TableChunk`s. + * @param left_on Column indices of the keys in the left table. + * @param right_on Column indices of the keys in the right table. + * @param tag Disambiguating tag for the broadcast of the left table. + * @param keep_keys Does the result contain the key columns, or only "carrier" value + * columns + * + * @return Coroutine representing the completion of the join. + */ + +streaming::Node left_semi_join_shuffle( + std::shared_ptr ctx, + std::shared_ptr left, + std::shared_ptr right, + std::shared_ptr ch_out, + std::vector left_on, + std::vector right_on +); + /** * @brief Shuffle the input channel by hash-partitioning on given key columns. * diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp new file mode 100644 index 000000000..be5788844 --- /dev/null +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -0,0 +1,1033 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "concatenate.hpp" +#include "join.hpp" +#include "utils.hpp" + +/* Query 4 from the TPC-H benchmark. + +This performs a left semi join between the orders and lineitem tables, +followed by a grouped count aggregation on a low-cardinality column. 
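+
+For reference, the equivalent SQL (matching the Polars query below and the
+date range built in `read_orders`) is:
+
+```sql
+select
+    o_orderpriority,
+    count(*) as order_count
+from
+    orders
+where
+    o_orderdate >= date '1993-07-01'
+    and o_orderdate < date '1993-10-01'
+    and exists (
+        select *
+        from lineitem
+        where l_orderkey = o_orderkey
+          and l_commitdate < l_receiptdate
+    )
+group by
+    o_orderpriority
+order by
+    o_orderpriority
+```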
+ +```python +lineitem = pl.scan_parquet("/raid/rapidsmpf/data/tpch/scale-100.0/lineitem.parquet") +orders = pl.scan_parquet("/raid/rapidsmpf/data/tpch/scale-100.0/orders.parquet") + +var1 = date(1993, 7, 1) # 8582 +var2 = date(1993, 10, 1) # 8674 + +q = ( + # SQL exists translates to semi join in Polars API + orders.join( + (lineitem.filter(pl.col("l_commitdate") < pl.col("l_receiptdate"))), + left_on="o_orderkey", + right_on="l_orderkey", + how="semi", + ) + .filter(pl.col("o_orderdate").is_between(var1, var2, closed="left")) + .group_by("o_orderpriority") + .agg(pl.len().alias("order_count")) + .sort("o_orderpriority") +) +``` + +Some rough stats at SF-100: + +| Scale Factor | Table / Stage | Row Count | Percent of prior | +| ------------ | ----------------- | ----------- | ---------------- | +| 100 | lineitem | 600,037,902 | - | +| 100 | lineitem-filtered | 379,356,474 | 63% | +| 100 | orders | 150,000,000 | - | +| 100 | orders-filtered | 5,733,776 | 3.8% | +| 100 | joined | 5,257,429 | 91% / 1.4% | +| 100 | groupby | 5 | 0.0% | +| 100 | final | 5 | 100% | +| 100 | sorted | 5 | 100% | + +So the lineitem filter is somewhat selective, the orders filter is very +selective, the join is a bit selective (of orders), and the final groupby +reduces by a lot. + +The left-semi join can be performed in one of two ways: + +1. Broadcast `orders` to all ranks, shuffle `lineitem`, join per chunk, concat. +2. Shuffle `orders` and `lineitem`, join per chunk, concat + +Either way, we *always* shuffle / hash-partition `lineitem` before the join. +We rely on that has partitioning to ensure that the chunkwise left-semi join +is correct (notably, how duplicates are handled). + +We don't attempt to reuse the build table (`lineitem`) in the hash partition +for multiple probe table (`orders`) chunks. That would require broadcasting +`lineitem`, which we assume is too large. +*/ + +namespace { + +/* Select the columns after the join + +Input table: + +- o_orderkey +- o_orderpriority + +*/ +rapidsmpf::streaming::Node select_columns( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + std::vector indices +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + + while (!ch_out->is_shutdown()) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + ctx->comm()->logger().debug("Select columns: no more input"); + break; + } + co_await ctx->executor()->schedule(); + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto chunk_stream = chunk.stream(); + auto sequence_number = msg.sequence_number(); + auto table = chunk.table_view(); + + rapidsmpf::ndsh::detail::debug_print_table(ctx, table, "select_columns::input"); + std::vector> result; + result.reserve(indices.size()); + for (auto idx : indices) { + result.push_back( + std::make_unique( + table.column(idx), chunk_stream, ctx->br()->device_mr() + ) + ); + } + + auto result_table = std::make_unique(std::move(result)); + + rapidsmpf::ndsh::detail::debug_print_table( + ctx, result_table->view(), "select_columns::output" + ); + co_await ch_out->send( + rapidsmpf::streaming::to_message( + sequence_number, + std::make_unique( + std::move(result_table), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +/* +Read the lineitem table. 
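+
+The l_commitdate < l_receiptdate predicate is applied in the separate
+`filter_lineitem` node below rather than being pushed into the reader.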
+ +Output table: + + - l_commitdate + - l_receiptdate + - l_orderkey +*/ +rapidsmpf::streaming::Node read_lineitem( + std::shared_ptr ctx, + std::shared_ptr ch_out, + std::size_t num_producers, + cudf::size_type num_rows_per_chunk, + std::string const& input_directory +) { + auto files = rapidsmpf::ndsh::detail::list_parquet_files( + rapidsmpf::ndsh::detail::get_table_path(input_directory, "lineitem") + ); + auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) + .columns({ + "l_commitdate", // used in filter + "l_receiptdate", // used in filter + "l_orderkey", // used in join + }) + .build(); + + return rapidsmpf::streaming::node::read_parquet( + ctx, ch_out, num_producers, options, num_rows_per_chunk + ); +} + +/* +Read the orders table, including the filter on the o_orderdate column. + +Output table: + + - o_orderkey + - o_orderpriority +*/ +rapidsmpf::streaming::Node read_orders( + std::shared_ptr ctx, + std::shared_ptr ch_out, + std::size_t num_producers, + cudf::size_type num_rows_per_chunk, + std::string const& input_directory +) { + auto files = rapidsmpf::ndsh::detail::list_parquet_files( + rapidsmpf::ndsh::detail::get_table_path(input_directory, "orders") + ); + auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) + .columns({ + "o_orderkey", // used in join + "o_orderpriority", // used in group by + }) + .build(); + + // Build the filter expression 1993-07-01 <= o_orderdate < 1993-10-01 + cudf::timestamp_ms ts1{ + cuda::std::chrono::duration_cast( + cuda::std::chrono::sys_days( + cuda::std::chrono::year_month_day( + cuda::std::chrono::year(1993), + cuda::std::chrono::month(7), + cuda::std::chrono::day(1) + ) + ) + .time_since_epoch() + ) + }; + cudf::timestamp_ms ts2{ + cuda::std::chrono::duration_cast( + cuda::std::chrono::sys_days( + cuda::std::chrono::year_month_day( + cuda::std::chrono::year(1993), + cuda::std::chrono::month(10), + cuda::std::chrono::day(1) + ) + ) + .time_since_epoch() + ) + }; + + /* This vector will have the references for the expression `a < column < b` as + + 0: column_reference to o_orderdate + 1: scalar + 2: scalar + 3: literal + 4: literal + 5: operation GE + 6: operation LT + 7: operation AND + */ + + auto owner = new std::vector; + auto filter_stream = ctx->br()->stream_pool().get_stream(); + // 0 + owner->push_back( + std::make_shared( + "o_orderdate" + ) // position in the table + ); + + + // 1, 2: Scalars + owner->push_back( + std::make_shared>( + ts1, true, filter_stream + ) + ); + owner->push_back( + std::make_shared>( + ts2, true, filter_stream + ) + ); + + // 3, 4: Literals + owner->push_back( + std::make_shared( + *std::any_cast>>( + owner->at(1) + ) + ) + ); + owner->push_back( + std::make_shared( + *std::any_cast>>( + owner->at(2) + ) + ) + ); + + // 5: (GE, column, literal) + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::GREATER_EQUAL, + *std::any_cast>( + owner->at(0) + ), + *std::any_cast>(owner->at(3)) + ) + ); + + // 6 (LT, column, literal) + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::LESS, + *std::any_cast>( + owner->at(0) + ), + *std::any_cast>(owner->at(4)) + ) + ); + + // 7 (AND, GE, LT) + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::LOGICAL_AND, + *std::any_cast>(owner->at(5)), + *std::any_cast>(owner->at(6)) + ) + ); + + auto filter = std::make_unique( + filter_stream, + *std::any_cast>(owner->back()), + rapidsmpf::OwningWrapper(static_cast(owner), [](void* p) { + delete static_cast*>(p); + }) + ); + + 
return rapidsmpf::streaming::node::read_parquet( + ctx, ch_out, num_producers, options, num_rows_per_chunk, std::move(filter) + ); +} + +/* Filter the lineitem table. + +Input table: + + - l_commitdate + - l_receiptdate + - l_orderkey + +Output table: + + - l_orderkey +*/ +rapidsmpf::streaming::Node filter_lineitem( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + auto mr = ctx->br()->device_mr(); + while (!ch_out->is_shutdown()) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + co_await ctx->executor()->schedule(); + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto chunk_stream = chunk.stream(); + auto table = chunk.table_view(); + + rapidsmpf::ndsh::detail::debug_print_table(ctx, table, "lineitem"); + + auto l_commitdate = table.column(0); + auto l_receiptdate = table.column(1); + auto mask = cudf::binary_operation( + l_commitdate, + l_receiptdate, + cudf::binary_operator::LESS, + cudf::data_type(cudf::type_id::BOOL8), + chunk_stream, + mr + ); + auto filtered_table = + cudf::apply_boolean_mask(table.select({2}), mask->view(), chunk_stream, mr); + rapidsmpf::ndsh::detail::debug_print_table( + ctx, filtered_table->view(), "filtered_lineitem" + ); + co_await ch_out->send( + rapidsmpf::streaming::to_message( + msg.sequence_number(), + std::make_unique( + std::move(filtered_table), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +/* Group the joined orders table by o_orderpriority (chunk). + +We're performing a `.group_by(...).count()`, so the chunk-stage +is just a count. + +Input table: + + - o_orderkey + - o_orderpriority + +Output table: + + - o_orderpriority + - order_count +*/ +rapidsmpf::streaming::Node chunkwise_groupby_agg( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + std::vector partial_results; + std::uint64_t sequence = 0; + while (!ch_out->is_shutdown()) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + ctx->comm()->logger().debug("Chunkwise groupby agg: no more input"); + break; + } + co_await ctx->executor()->schedule(); + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + auto chunk_stream = chunk.stream(); + auto table = chunk.table_view(); + + rapidsmpf::ndsh::detail::debug_print_table( + ctx, table, "chunkwise_groupby_agg::input" + ); + + auto grouper = cudf::groupby::groupby( + table.select({0}), cudf::null_policy::EXCLUDE, cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_count_aggregation()); + requests.push_back( + cudf::groupby::aggregation_request(table.column(0), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + // Drop chunk, we don't need it. + std::ignore = std::move(chunk); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + + auto result_table = std::make_unique(std::move(result)); + rapidsmpf::ndsh::detail::debug_print_table( + ctx, result_table->view(), "chunkwise_groupby_agg::output" + ); + + + co_await ch_out->send( + rapidsmpf::streaming::to_message( + sequence++, + std::make_unique( + std::move(result_table), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +/* Group the joined orders table by o_orderpriority (final). 
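+
+Each chunk has already contributed per-group counts, and every input row is
+counted in exactly one chunk, so summing those counts per group reproduces the
+global count.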
+ +We're performing a `.group_by(...).count()`, so the final stage +is just a sum. + +Input table: + + - o_orderkey + - o_orderpriority + +Output table: + + - o_orderpriority + - order_count +*/ + +rapidsmpf::streaming::Node final_groupby_agg( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out, + rapidsmpf::OpID tag +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + auto msg = co_await ch_in->receive(); + auto next = co_await ch_in->receive(); + ctx->comm()->logger().debug("Final groupby"); + RAPIDSMPF_EXPECTS(next.empty(), "Expecting concatenated input at this point"); + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto chunk_stream = chunk.stream(); + auto table = chunk.table_view(); + + rapidsmpf::ndsh::detail::debug_print_table(ctx, table, "final_groupby_agg::input"); + std::unique_ptr local_result{nullptr}; + if (!table.is_empty()) { + auto grouper = cudf::groupby::groupby( + table.select({0}), cudf::null_policy::EXCLUDE, cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + + requests.push_back( + cudf::groupby::aggregation_request(table.column(1), std::move(aggs)) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + // Drop chunk, we don't need it. + std::ignore = std::move(chunk); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + local_result = std::make_unique(std::move(result)); + } + if (ctx->comm()->nranks() > 1) { + // Reduce across ranks... + // Need a reduce primitive in rapidsmpf, but let's just use an allgather and + // discard for now. + rapidsmpf::streaming::AllGather gatherer{ctx, tag}; + if (local_result) { + auto pack = + cudf::pack(local_result->view(), chunk_stream, ctx->br()->device_mr()); + gatherer.insert( + 0, + {rapidsmpf::PackedData( + std::move(pack.metadata), + ctx->br()->move(std::move(pack.gpu_data), chunk_stream) + )} + ); + } + gatherer.insert_finished(); + auto packed_data = + co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); + if (ctx->comm()->rank() == 0) { + auto global_result = rapidsmpf::unpack_and_concat( + rapidsmpf::unspill_partitions( + std::move(packed_data), ctx->br(), true, ctx->statistics() + ), + chunk_stream, + ctx->br(), + ctx->statistics() + ); + if (ctx->comm()->rank() == 0) { + // We will only actually bother to do this on rank zero. 
+ auto result_view = global_result->view(); + auto grouper = cudf::groupby::groupby( + result_view.select({0}), cudf::null_policy::EXCLUDE, cudf::sorted::NO + ); + auto requests = std::vector(); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + requests.push_back( + cudf::groupby::aggregation_request( + result_view.column(1), std::move(aggs) + ) + ); + auto [keys, results] = + grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); + global_result.reset(); + auto result = keys->release(); + for (auto&& r : results) { + std::ranges::move(r.results, std::back_inserter(result)); + } + co_await ch_out->send( + rapidsmpf::streaming::to_message( + 0, + std::make_unique( + std::make_unique(std::move(result)), chunk_stream + ) + ) + ); + } + } else { + std::ignore = std::move(packed_data); + } + } else { + co_await ch_out->send( + rapidsmpf::streaming::to_message( + 0, + std::make_unique( + std::move(local_result), chunk_stream + ) + ) + ); + } + co_await ch_out->drain(ctx->executor()); +} + +/* Sort the grouped orders table. + +Input table: + + - o_orderpriority + - order_count + +Output table: + + - o_orderpriority + - order_count +*/ +rapidsmpf::streaming::Node sort_by( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + ctx->comm()->logger().debug("Final sortby"); + auto msg = co_await ch_in->receive(); + // We know we only have a single chunk from the groupby + if (msg.empty()) { + co_return; + } + ctx->comm()->logger().debug("Sortby"); + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto table = chunk.table_view(); + rapidsmpf::ndsh::detail::debug_print_table(ctx, table, "sort_by::input"); + auto result = rapidsmpf::streaming::to_message( + 0, + std::make_unique( + cudf::sort_by_key( + table, + table.select({0, 1}), + {cudf::order::ASCENDING, cudf::order::DESCENDING}, + {cudf::null_order::BEFORE, cudf::null_order::BEFORE}, + chunk.stream(), + ctx->br()->device_mr() + ), + chunk.stream() + ) + ); + co_await ch_out->send(std::move(result)); + co_await ch_out->drain(ctx->executor()); +} + +rapidsmpf::streaming::Node write_parquet( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::string output_path +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in}; + co_await ctx->executor()->schedule(); + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + co_return; + } + ctx->comm()->logger().debug("write parquet"); + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto sink = cudf::io::sink_info(output_path); + auto builder = cudf::io::parquet_writer_options::builder(sink, chunk.table_view()); + auto metadata = cudf::io::table_input_metadata(chunk.table_view()); + metadata.column_metadata[0].set_name("o_orderpriority"); + metadata.column_metadata[1].set_name("order_count"); + builder = builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_parquet(options, chunk.stream()); + ctx->comm()->logger().print( + "Wrote chunk with ", + chunk.table_view().num_rows(), + " rows and ", + chunk.table_view().num_columns(), + " columns to ", + output_path + ); +} + + +} // namespace + +struct ProgramOptions { + int num_streaming_threads{1}; + cudf::size_type num_rows_per_chunk{100'000'000}; + std::optional spill_device_limit{std::nullopt}; + bool use_shuffle_join = false; + std::string output_file; + std::string input_directory; + std::uint32_t num_partitions{16}; +}; + +// TODO: Refactor to 
common utilities +ProgramOptions parse_options(int argc, char** argv) { + ProgramOptions options; + + auto print_usage = [&argv]() { + std::cerr + << "Usage: " << argv[0] << " [options]\n" + << "Options:\n" + << " --num-streaming-threads Number of streaming threads (default: 1)\n" + << " --num-rows-per-chunk Number of rows per chunk (default: " + "100000000)\n" + << " --spill-device-limit Fractional spill device limit (default: " + "None)\n" + << " --use-shuffle-join Use shuffle join (default: false)\n" + << " --output-file Output file path (required)\n" + << " --input-directory Input directory path (required)\n" + << " --num-partitions Number of partitions (default: 16)\n" + << " --help Show this help message\n"; + }; + + static std::array long_options = {{ + {.name = "num-streaming-threads", + .has_arg = required_argument, + .flag = nullptr, + .val = 1}, + {.name = "num-rows-per-chunk", + .has_arg = required_argument, + .flag = nullptr, + .val = 2}, + {.name = "use-shuffle-join", .has_arg = no_argument, .flag = nullptr, .val = 3}, + {.name = "output-file", .has_arg = required_argument, .flag = nullptr, .val = 4}, + {.name = "input-directory", + .has_arg = required_argument, + .flag = nullptr, + .val = 5}, + {.name = "num-partitions", + .has_arg = required_argument, + .flag = nullptr, + .val = 6}, + {.name = "help", .has_arg = no_argument, .flag = nullptr, .val = 6}, + {.name = "spill-device-limit", + .has_arg = required_argument, + .flag = nullptr, + .val = 7}, + {.name = nullptr, .has_arg = 0, .flag = nullptr, .val = 0}, + }}; + + int opt; + int option_index = 0; + + bool saw_output_file = false; + bool saw_input_directory = false; + + while ((opt = getopt_long(argc, argv, "", long_options.data(), &option_index)) != -1) + { + switch (opt) { + case 1: + options.num_streaming_threads = std::atoi(optarg); + break; + case 2: + options.num_rows_per_chunk = std::atoi(optarg); + break; + case 3: + options.use_shuffle_join = true; + break; + case 4: + options.output_file = optarg; + saw_output_file = true; + break; + case 5: + options.input_directory = optarg; + saw_input_directory = true; + break; + case 6: + options.num_partitions = static_cast(std::atoi(optarg)); + break; + case 7: + print_usage(); + std::exit(0); + case 8: + options.spill_device_limit = std::stod(optarg); + break; + case '?': + if (optopt == 0 && optind > 1) { + std::cerr << "Error: Unknown option '" << argv[optind - 1] << "'\n\n"; + } + print_usage(); + std::exit(1); + default: + print_usage(); + std::exit(1); + } + } + + // Check if required options were provided + if (!saw_output_file || !saw_input_directory) { + if (!saw_output_file) { + std::cerr << "Error: --output-file is required\n"; + } + if (!saw_input_directory) { + std::cerr << "Error: --input-directory is required\n"; + } + std::cerr << std::endl; + print_usage(); + std::exit(1); + } + + return options; +} + +int main(int argc, char** argv) { + cudaFree(nullptr); + rapidsmpf::mpi::init(&argc, &argv); + MPI_Comm mpi_comm; + RAPIDSMPF_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); + auto cmd_options = parse_options(argc, argv); + auto limit_size = rmm::percent_of_free_device_memory( + static_cast(cmd_options.spill_device_limit.value_or(1) * 100) + ); + rmm::mr::cuda_async_memory_resource mr{}; + auto stats_mr = rapidsmpf::RmmResourceAdaptor(&mr); + rmm::device_async_resource_ref mr_ref(stats_mr); + rmm::mr::set_current_device_resource(&stats_mr); + rmm::mr::set_current_device_resource_ref(mr_ref); + std::unordered_map + memory_available{}; + if 
(cmd_options.spill_device_limit.has_value()) { + memory_available[rapidsmpf::MemoryType::DEVICE] = rapidsmpf::LimitAvailableMemory{ + &stats_mr, static_cast(limit_size) + }; + } + auto br = std::make_shared( + stats_mr, std::move(memory_available) + ); + auto envvars = rapidsmpf::config::get_environment_variables(); + envvars["num_streaming_threads"] = std::to_string(cmd_options.num_streaming_threads); + auto options = rapidsmpf::config::Options(envvars); + auto stats = std::make_shared(&stats_mr); + { + auto comm = rapidsmpf::ucxx::init_using_mpi(mpi_comm, options); + auto progress = + std::make_shared(comm->logger(), stats); + auto ctx = + std::make_shared(options, comm, br, stats); + comm->logger().print( + "Executor has ", ctx->executor()->thread_count(), " threads" + ); + comm->logger().print("Executor has ", ctx->comm()->nranks(), " ranks"); + + std::string output_path = cmd_options.output_file; + std::vector timings; + for (int i = 0; i < 2; i++) { + rapidsmpf::OpID op_id{0}; + std::vector nodes; + auto start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q4 pipeline"); + // Convention for channel names: express the *output*. + /* Lineitem Table */ + // [l_commitdate, l_receiptdate, l_orderkey] + auto lineitem = ctx->create_channel(); + // [l_orderkey] + auto filtered_lineitem = ctx->create_channel(); + // [l_orderkey] + auto filtered_lineitem_shuffled = ctx->create_channel(); + + /* Orders Table */ + // [o_orderkey, o_orderpriority] + auto order = ctx->create_channel(); + + // [o_orderkey, o_orderpriority] + // Ideally this would *just* be o_orderpriority, pushing the projection + // into the join node / dropping the join key. + auto orders_x_lineitem = ctx->create_channel(); + + // [o_orderpriority] + auto projected_columns = ctx->create_channel(); + // [o_orderpriority, order_count] + auto grouped_chunkwise = ctx->create_channel(); + // [o_orderpriority, order_count] + auto grouped_concatenated = ctx->create_channel(); + // [o_orderpriority, order_count] + auto grouped_finalized = ctx->create_channel(); + // [o_orderpriority, order_count] + auto sorted = ctx->create_channel(); + + nodes.push_back(read_lineitem( + ctx, + lineitem, + 4, + cmd_options.num_rows_per_chunk, + cmd_options.input_directory + )); + nodes.push_back( + filter_lineitem(ctx, lineitem, filtered_lineitem) + ); // l_orderkey + nodes.push_back(read_orders( + ctx, + order, + 4, + cmd_options.num_rows_per_chunk, + cmd_options.input_directory + )); + // nodes.push_back(select_columns(ctx, order, projected_order, {1, 2})); + + nodes.push_back( + rapidsmpf::ndsh::shuffle( + ctx, + filtered_lineitem, + filtered_lineitem_shuffled, + {0}, + cmd_options.num_partitions, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} + ) + ); + + if (cmd_options.use_shuffle_join) { + auto filtered_order_shuffled = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::shuffle( + ctx, + order, + filtered_order_shuffled, + {0}, + cmd_options.num_partitions, + rapidsmpf::OpID{ + static_cast(10 * i + op_id++) + } + ) + ); + + nodes.push_back( + rapidsmpf::ndsh::left_semi_join_shuffle( + ctx, + filtered_order_shuffled, + filtered_lineitem_shuffled, + orders_x_lineitem, + {0}, + {0} + ) + ); + } else { + nodes.push_back( + rapidsmpf::ndsh::left_semi_join_broadcast_left( + ctx, + order, + filtered_lineitem_shuffled, + orders_x_lineitem, + {0}, + {0}, + rapidsmpf::OpID{ + static_cast(10 * i + op_id++) + }, + rapidsmpf::ndsh::KeepKeys::YES + ) + ); + } + + nodes.push_back( + select_columns(ctx, 
orders_x_lineitem, projected_columns, {1}) + ); + + nodes.push_back( + chunkwise_groupby_agg(ctx, projected_columns, grouped_chunkwise) + ); + nodes.push_back( + rapidsmpf::ndsh::concatenate( + ctx, + grouped_chunkwise, + grouped_concatenated, + rapidsmpf::ndsh::ConcatOrder::DONT_CARE + ) + ); + nodes.push_back(final_groupby_agg( + ctx, + grouped_concatenated, + grouped_finalized, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} + )); + nodes.push_back(sort_by(ctx, grouped_finalized, sorted)); + nodes.push_back(write_parquet(ctx, sorted, output_path)); + } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration pipeline = end - start; + start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Q4 Iteration"); + rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration compute = end - start; + comm->logger().print( + "Iteration ", i, " pipeline construction time [s]: ", pipeline.count() + ); + comm->logger().print("Iteration ", i, " compute time [s]: ", compute.count()); + timings.push_back(pipeline.count()); + timings.push_back(compute.count()); + ctx->comm()->logger().print(stats->report()); + RAPIDSMPF_MPI(MPI_Barrier(mpi_comm)); + } + if (comm->rank() == 0) { + for (int i = 0; i < 2; i++) { + comm->logger().print( + "Iteration ", + i, + " pipeline construction time [s]: ", + timings[size_t(2 * i)] + ); + comm->logger().print( + "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] + ); + } + } + } + + RAPIDSMPF_MPI(MPI_Comm_free(&mpi_comm)); + RAPIDSMPF_MPI(MPI_Finalize()); + return 0; +} diff --git a/cpp/benchmarks/streaming/ndsh/q09.cpp b/cpp/benchmarks/streaming/ndsh/q09.cpp index e136eb3ce..85c5d4772 100644 --- a/cpp/benchmarks/streaming/ndsh/q09.cpp +++ b/cpp/benchmarks/streaming/ndsh/q09.cpp @@ -60,17 +60,6 @@ namespace { -std::string get_table_path( - std::string const& input_directory, std::string const& table_name -) { - auto dir = input_directory.empty() ? "." 
: input_directory; - auto file_path = dir + "/" + table_name + ".parquet"; - if (std::filesystem::exists(file_path)) { - return file_path; - } - return dir + "/" + table_name + "/"; -} - rapidsmpf::streaming::Node read_lineitem( std::shared_ptr ctx, std::shared_ptr ch_out, @@ -79,7 +68,7 @@ rapidsmpf::streaming::Node read_lineitem( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "lineitem") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "lineitem") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns( @@ -104,7 +93,7 @@ rapidsmpf::streaming::Node read_nation( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "nation") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "nation") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"n_name", "n_nationkey"}) @@ -122,7 +111,7 @@ rapidsmpf::streaming::Node read_orders( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "orders") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "orders") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"o_orderdate", "o_orderkey"}) @@ -140,7 +129,7 @@ rapidsmpf::streaming::Node read_part( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "part") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "part") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"p_partkey", "p_name"}) @@ -158,7 +147,7 @@ rapidsmpf::streaming::Node read_partsupp( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "partsupp") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "partsupp") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"ps_partkey", "ps_suppkey", "ps_supplycost"}) @@ -176,7 +165,7 @@ rapidsmpf::streaming::Node read_supplier( std::string const& input_directory ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( - get_table_path(input_directory, "supplier") + rapidsmpf::ndsh::detail::get_table_path(input_directory, "supplier") ); auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info(files)) .columns({"s_nationkey", "s_suppkey"}) diff --git a/cpp/benchmarks/streaming/ndsh/utils.cpp b/cpp/benchmarks/streaming/ndsh/utils.cpp index 3f1f177b4..41eb9a7ab 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.cpp +++ b/cpp/benchmarks/streaming/ndsh/utils.cpp @@ -47,6 +47,45 @@ std::vector list_parquet_files(std::string const& root_path) { return result; } +std::string get_table_path( + std::string const& input_directory, std::string const& table_name +) { + auto dir = input_directory.empty() ? "." 
: input_directory; + auto file_path = dir + "/" + table_name + ".parquet"; + if (std::filesystem::exists(file_path)) { + return file_path; + } + return dir + "/" + table_name + "/"; +} + +void debug_print_table( + std::shared_ptr ctx, + cudf::table_view const& table, + std::string const& label +) { + if (table.num_rows() == 0) { + ctx->comm()->logger().debug("[DEBUG] ", label, " is empty"); + return; + } + ctx->comm()->logger().debug("[DEBUG] ", label, " rows ", table.num_rows()); + + // For simplicity, just print that we have the table + // To actually print values would require type dispatch and host copies + for (cudf::size_type col_idx = 0; col_idx < table.num_columns(); ++col_idx) { + ctx->comm()->logger().debug( + " Column ", + col_idx, + ": type=", + cudf::type_to_name(table.column(col_idx).type()), + " size=", + table.column(col_idx).size(), + " nulls=", + table.column(col_idx).null_count() + ); + } +} + + } // namespace detail streaming::TableChunk to_device( diff --git a/cpp/benchmarks/streaming/ndsh/utils.hpp b/cpp/benchmarks/streaming/ndsh/utils.hpp index 9588a33b1..64b496526 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.hpp +++ b/cpp/benchmarks/streaming/ndsh/utils.hpp @@ -14,11 +14,29 @@ namespace rapidsmpf::ndsh { namespace detail { [[nodiscard]] std::vector list_parquet_files(std::string const& root_path); -} +[[nodiscard]] std::string get_table_path( + std::string const& input_directory, std::string const& table_name +); +/* +Print the table to the logger. + +Note that this requires RAPIDSMPF_LOG to be set to DEBUG or TRACE. + +@param ctx The context. +@param table The table to print. +@param label The label to print. +*/ +void debug_print_table( + std::shared_ptr ctx, + cudf::table_view const& table, + std::string const& label +); +} // namespace detail [[nodiscard]] streaming::TableChunk to_device( std::shared_ptr ctx, streaming::TableChunk&& chunk, bool allow_overbooking = false ); + } // namespace rapidsmpf::ndsh From ff504f0725fe1aab4a61211e496f963aefac63fb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Dec 2025 11:52:19 -0800 Subject: [PATCH 46/75] fixup! 
Merge branch 'wence/fea/q03' into tom/streaming-q4 --- cpp/benchmarks/streaming/ndsh/q04.cpp | 322 +++++++++++------------- cpp/benchmarks/streaming/ndsh/utils.hpp | 3 - 2 files changed, 140 insertions(+), 185 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 8ec039908..afb54bc90 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -806,210 +807,167 @@ ProgramOptions parse_options(int argc, char** argv) { int main(int argc, char** argv) { cudaFree(nullptr); - rapidsmpf::mpi::init(&argc, &argv); - MPI_Comm mpi_comm; - RAPIDSMPF_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); - auto cmd_options = parse_options(argc, argv); - auto limit_size = rmm::percent_of_free_device_memory( - static_cast(cmd_options.spill_device_limit.value_or(1) * 100) - ); - rmm::mr::cuda_async_memory_resource mr{}; - auto stats_mr = rapidsmpf::RmmResourceAdaptor(&mr); - rmm::device_async_resource_ref mr_ref(stats_mr); - rmm::mr::set_current_device_resource(&stats_mr); - rmm::mr::set_current_device_resource_ref(mr_ref); - std::unordered_map - memory_available{}; - if (cmd_options.spill_device_limit.has_value()) { - memory_available[rapidsmpf::MemoryType::DEVICE] = rapidsmpf::LimitAvailableMemory{ - &stats_mr, static_cast(limit_size) - }; - } - auto br = std::make_shared( - stats_mr, std::move(memory_available) - ); - auto envvars = rapidsmpf::config::get_environment_variables(); - envvars["num_streaming_threads"] = std::to_string(cmd_options.num_streaming_threads); - auto options = rapidsmpf::config::Options(envvars); - auto stats = std::make_shared(&stats_mr); - { - auto comm = rapidsmpf::ucxx::init_using_mpi(mpi_comm, options); - auto progress = - std::make_shared(comm->logger(), stats); - auto ctx = - std::make_shared(options, comm, br, stats); - comm->logger().print( - "Executor has ", ctx->executor()->thread_count(), " threads" - ); - comm->logger().print("Executor has ", ctx->comm()->nranks(), " ranks"); - - std::string output_path = cmd_options.output_file; - std::vector timings; - for (int i = 0; i < 2; i++) { - rapidsmpf::OpID op_id{0}; - std::vector nodes; - auto start = std::chrono::steady_clock::now(); - { - RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q4 pipeline"); - // Convention for channel names: express the *output*. - /* Lineitem Table */ - // [l_commitdate, l_receiptdate, l_orderkey] - auto lineitem = ctx->create_channel(); - // [l_orderkey] - auto filtered_lineitem = ctx->create_channel(); - // [l_orderkey] - auto filtered_lineitem_shuffled = ctx->create_channel(); - - /* Orders Table */ - // [o_orderkey, o_orderpriority] - auto order = ctx->create_channel(); - - // [o_orderkey, o_orderpriority] - // Ideally this would *just* be o_orderpriority, pushing the projection - // into the join node / dropping the join key. 
- auto orders_x_lineitem = ctx->create_channel(); - - // [o_orderpriority] - auto projected_columns = ctx->create_channel(); - // [o_orderpriority, order_count] - auto grouped_chunkwise = ctx->create_channel(); - // [o_orderpriority, order_count] - auto grouped_concatenated = ctx->create_channel(); - // [o_orderpriority, order_count] - auto grouped_finalized = ctx->create_channel(); - // [o_orderpriority, order_count] - auto sorted = ctx->create_channel(); - - nodes.push_back(read_lineitem( - ctx, - lineitem, - 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); - nodes.push_back( - filter_lineitem(ctx, lineitem, filtered_lineitem) - ); // l_orderkey - nodes.push_back(read_orders( + + rapidsmpf::ndsh::FinalizeMPI finalize{}; + cudaFree(nullptr); + // work around https://github.com/rapidsai/cudf/issues/20849 + cudf::initialize(); + auto mr = rmm::mr::cuda_async_memory_resource{}; + auto stats_wrapper = rapidsmpf::RmmResourceAdaptor(&mr); + auto arguments = rapidsmpf::ndsh::parse_arguments(argc, argv); + auto ctx = rapidsmpf::ndsh::create_context(arguments, &stats_wrapper); + std::string output_path = arguments.output_file; + std::vector timings; + + + for (int i = 0; i < arguments.num_iterations; i++) { + rapidsmpf::OpID op_id{0}; + std::vector nodes; + auto start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Constructing Q4 pipeline"); + // Convention for channel names: express the *output*. + /* Lineitem Table */ + // [l_commitdate, l_receiptdate, l_orderkey] + auto lineitem = ctx->create_channel(); + // [l_orderkey] + auto filtered_lineitem = ctx->create_channel(); + // [l_orderkey] + auto filtered_lineitem_shuffled = ctx->create_channel(); + + /* Orders Table */ + // [o_orderkey, o_orderpriority] + auto order = ctx->create_channel(); + + // [o_orderkey, o_orderpriority] + // Ideally this would *just* be o_orderpriority, pushing the projection + // into the join node / dropping the join key. 
+ auto orders_x_lineitem = ctx->create_channel(); + + // [o_orderpriority] + auto projected_columns = ctx->create_channel(); + // [o_orderpriority, order_count] + auto grouped_chunkwise = ctx->create_channel(); + // [o_orderpriority, order_count] + auto grouped_concatenated = ctx->create_channel(); + // [o_orderpriority, order_count] + auto grouped_finalized = ctx->create_channel(); + // [o_orderpriority, order_count] + auto sorted = ctx->create_channel(); + + nodes.push_back(read_lineitem( + ctx, lineitem, 4, arguments.num_rows_per_chunk, arguments.input_directory + )); + nodes.push_back( + filter_lineitem(ctx, lineitem, filtered_lineitem) + ); // l_orderkey + nodes.push_back(read_orders( + ctx, order, 4, arguments.num_rows_per_chunk, arguments.input_directory + )); + + // TODO: configurable + std::uint32_t num_partitions = 16; + + nodes.push_back( + rapidsmpf::ndsh::shuffle( ctx, - order, - 4, - cmd_options.num_rows_per_chunk, - cmd_options.input_directory - )); - // nodes.push_back(select_columns(ctx, order, projected_order, {1, 2})); + filtered_lineitem, + filtered_lineitem_shuffled, + {0}, + num_partitions, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} + ) + ); + if (arguments.use_shuffle_join) { + auto filtered_order_shuffled = ctx->create_channel(); nodes.push_back( rapidsmpf::ndsh::shuffle( ctx, - filtered_lineitem, - filtered_lineitem_shuffled, + order, + filtered_order_shuffled, {0}, - cmd_options.num_partitions, + num_partitions, rapidsmpf::OpID{static_cast(10 * i + op_id++)} ) ); - if (cmd_options.use_shuffle_join) { - auto filtered_order_shuffled = ctx->create_channel(); - nodes.push_back( - rapidsmpf::ndsh::shuffle( - ctx, - order, - filtered_order_shuffled, - {0}, - cmd_options.num_partitions, - rapidsmpf::OpID{ - static_cast(10 * i + op_id++) - } - ) - ); - - nodes.push_back( - rapidsmpf::ndsh::left_semi_join_shuffle( - ctx, - filtered_order_shuffled, - filtered_lineitem_shuffled, - orders_x_lineitem, - {0}, - {0} - ) - ); - } else { - nodes.push_back( - rapidsmpf::ndsh::left_semi_join_broadcast_left( - ctx, - order, - filtered_lineitem_shuffled, - orders_x_lineitem, - {0}, - {0}, - rapidsmpf::OpID{ - static_cast(10 * i + op_id++) - }, - rapidsmpf::ndsh::KeepKeys::YES - ) - ); - } - nodes.push_back( - select_columns(ctx, orders_x_lineitem, projected_columns, {1}) - ); - - nodes.push_back( - chunkwise_groupby_agg(ctx, projected_columns, grouped_chunkwise) + rapidsmpf::ndsh::left_semi_join_shuffle( + ctx, + filtered_order_shuffled, + filtered_lineitem_shuffled, + orders_x_lineitem, + {0}, + {0} + ) ); + } else { nodes.push_back( - rapidsmpf::ndsh::concatenate( + rapidsmpf::ndsh::left_semi_join_broadcast_left( ctx, - grouped_chunkwise, - grouped_concatenated, - rapidsmpf::ndsh::ConcatOrder::DONT_CARE + order, + filtered_lineitem_shuffled, + orders_x_lineitem, + {0}, + {0}, + rapidsmpf::OpID{static_cast(10 * i + op_id++)}, + rapidsmpf::ndsh::KeepKeys::YES ) ); - nodes.push_back(final_groupby_agg( + } + + nodes.push_back( + select_columns(ctx, orders_x_lineitem, projected_columns, {1}) + ); + + nodes.push_back( + chunkwise_groupby_agg(ctx, projected_columns, grouped_chunkwise) + ); + nodes.push_back( + rapidsmpf::ndsh::concatenate( ctx, + grouped_chunkwise, grouped_concatenated, - grouped_finalized, - rapidsmpf::OpID{static_cast(10 * i + op_id++)} - )); - nodes.push_back(sort_by(ctx, grouped_finalized, sorted)); - nodes.push_back(write_parquet(ctx, sorted, output_path)); - } - auto end = std::chrono::steady_clock::now(); - std::chrono::duration pipeline = end - start; - start 
= std::chrono::steady_clock::now(); - { - RAPIDSMPF_NVTX_SCOPED_RANGE("Q4 Iteration"); - rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); - } - end = std::chrono::steady_clock::now(); - std::chrono::duration compute = end - start; - comm->logger().print( - "Iteration ", i, " pipeline construction time [s]: ", pipeline.count() + rapidsmpf::ndsh::ConcatOrder::DONT_CARE + ) ); - comm->logger().print("Iteration ", i, " compute time [s]: ", compute.count()); - timings.push_back(pipeline.count()); - timings.push_back(compute.count()); - ctx->comm()->logger().print(stats->report()); - RAPIDSMPF_MPI(MPI_Barrier(mpi_comm)); + nodes.push_back(final_groupby_agg( + ctx, + grouped_concatenated, + grouped_finalized, + rapidsmpf::OpID{static_cast(10 * i + op_id++)} + )); + nodes.push_back(sort_by(ctx, grouped_finalized, sorted)); + nodes.push_back(write_parquet(ctx, sorted, output_path)); } - if (comm->rank() == 0) { - for (int i = 0; i < 2; i++) { - comm->logger().print( - "Iteration ", - i, - " pipeline construction time [s]: ", - timings[size_t(2 * i)] - ); - comm->logger().print( - "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] - ); - } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration pipeline = end - start; + start = std::chrono::steady_clock::now(); + { + RAPIDSMPF_NVTX_SCOPED_RANGE("Q4 Iteration"); + rapidsmpf::streaming::run_streaming_pipeline(std::move(nodes)); } + end = std::chrono::steady_clock::now(); + std::chrono::duration compute = end - start; + timings.push_back(pipeline.count()); + timings.push_back(compute.count()); + ctx->comm()->logger().print(ctx->statistics()->report()); + ctx->statistics()->clear(); } - RAPIDSMPF_MPI(MPI_Comm_free(&mpi_comm)); - RAPIDSMPF_MPI(MPI_Finalize()); + if (ctx->comm()->rank() == 0) { + for (int i = 0; i < arguments.num_iterations; i++) { + ctx->comm()->logger().print( + "Iteration ", + i, + " pipeline construction time [s]: ", + timings[size_t(2 * i)] + ); + ctx->comm()->logger().print( + "Iteration ", i, " compute time [s]: ", timings[size_t(2 * i + 1)] + ); + } + } return 0; } diff --git a/cpp/benchmarks/streaming/ndsh/utils.hpp b/cpp/benchmarks/streaming/ndsh/utils.hpp index 63521ff24..9c7a6a674 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.hpp +++ b/cpp/benchmarks/streaming/ndsh/utils.hpp @@ -95,8 +95,6 @@ namespace detail { bool allow_overbooking = false ); -<<<<<<< HEAD -======= ///< @brief Communicator type to use enum class CommType : std::uint8_t { SINGLE, ///< Single process communicator @@ -162,5 +160,4 @@ struct FinalizeMPI { } } }; ->>>>>>> wence/fea/q03 } // namespace rapidsmpf::ndsh From 20308074db2e5c018ff3a5af7c47720356cdf3be Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Dec 2025 07:56:05 -0800 Subject: [PATCH 47/75] Use groupby utilities --- cpp/benchmarks/streaming/ndsh/q04.cpp | 648 +++++--------------------- 1 file changed, 120 insertions(+), 528 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index afb54bc90..0b8076b34 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -1,53 +1,36 @@ /** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
* SPDX-License-Identifier: Apache-2.0 */ -#include +#include #include -#include +#include #include -#include #include #include -#include #include #include +#include +#include #include #include -#include -#include #include #include #include -#include -#include #include -#include -#include +#include #include -#include #include #include -#include -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include #include #include #include @@ -56,69 +39,36 @@ #include #include "concatenate.hpp" +#include "groupby.hpp" #include "join.hpp" +#include "parquet_writer.hpp" +#include "sort.hpp" #include "utils.hpp" -/* Query 4 from the TPC-H benchmark. - -This performs a left semi join between the orders and lineitem tables, -followed by a grouped count aggregation on a low-cardinality column. - -```python -lineitem = pl.scan_parquet("/raid/rapidsmpf/data/tpch/scale-100.0/lineitem.parquet") -orders = pl.scan_parquet("/raid/rapidsmpf/data/tpch/scale-100.0/orders.parquet") - -var1 = date(1993, 7, 1) # 8582 -var2 = date(1993, 10, 1) # 8674 - -q = ( - # SQL exists translates to semi join in Polars API - orders.join( - (lineitem.filter(pl.col("l_commitdate") < pl.col("l_receiptdate"))), - left_on="o_orderkey", - right_on="l_orderkey", - how="semi", - ) - .filter(pl.col("o_orderdate").is_between(var1, var2, closed="left")) - .group_by("o_orderpriority") - .agg(pl.len().alias("order_count")) - .sort("o_orderpriority") -) -``` - -Some rough stats at SF-100: - -| Scale Factor | Table / Stage | Row Count | Percent of prior | -| ------------ | ----------------- | ----------- | ---------------- | -| 100 | lineitem | 600,037,902 | - | -| 100 | lineitem-filtered | 379,356,474 | 63% | -| 100 | orders | 150,000,000 | - | -| 100 | orders-filtered | 5,733,776 | 3.8% | -| 100 | joined | 5,257,429 | 91% / 1.4% | -| 100 | groupby | 5 | 0.0% | -| 100 | final | 5 | 100% | -| 100 | sorted | 5 | 100% | - -So the lineitem filter is somewhat selective, the orders filter is very -selective, the join is a bit selective (of orders), and the final groupby -reduces by a lot. - -The left-semi join can be performed in one of two ways: - -1. Broadcast `orders` to all ranks, shuffle `lineitem`, join per chunk, concat. -2. Shuffle `orders` and `lineitem`, join per chunk, concat - -Either way, we *always* shuffle / hash-partition `lineitem` before the join. -We rely on that has partitioning to ensure that the chunkwise left-semi join -is correct (notably, how duplicates are handled). - -We don't attempt to reuse the build table (`lineitem`) in the hash partition -for multiple probe table (`orders`) chunks. That would require broadcasting -`lineitem`, which we assume is too large. -*/ - namespace { +std::vector chunkwise_groupby_requests() { + auto requests = std::vector(); + std::vector()>> aggs; + // count(*) + aggs.emplace_back([]() { + return cudf::make_count_aggregation( + cudf::null_policy::INCLUDE + ); + }); + requests.emplace_back(0, std::move(aggs)); + return requests; +} + +std::vector final_groupby_requests() { + auto requests = std::vector(); + std::vector()>> aggs; + // sum(count(*)) + aggs.emplace_back(cudf::make_sum_aggregation); + requests.emplace_back(1, std::move(aggs)); + return requests; +} + /* Select the columns after the join Input table: @@ -173,15 +123,6 @@ rapidsmpf::streaming::Node select_columns( co_await ch_out->drain(ctx->executor()); } -/* -Read the lineitem table. 
- -Output table: - - - l_commitdate - - l_receiptdate - - l_orderkey -*/ rapidsmpf::streaming::Node read_lineitem( std::shared_ptr ctx, std::shared_ptr ch_out, @@ -205,14 +146,6 @@ rapidsmpf::streaming::Node read_lineitem( ); } -/* -Read the orders table, including the filter on the o_orderdate column. - -Output table: - - - o_orderkey - - o_orderpriority -*/ rapidsmpf::streaming::Node read_orders( std::shared_ptr ctx, std::shared_ptr ch_out, @@ -350,18 +283,6 @@ rapidsmpf::streaming::Node read_orders( ); } -/* Filter the lineitem table. - -Input table: - - - l_commitdate - - l_receiptdate - - l_orderkey - -Output table: - - - l_orderkey -*/ rapidsmpf::streaming::Node filter_lineitem( std::shared_ptr ctx, std::shared_ptr ch_in, @@ -405,406 +326,39 @@ rapidsmpf::streaming::Node filter_lineitem( co_await ch_out->drain(ctx->executor()); } -/* Group the joined orders table by o_orderpriority (chunk). - -We're performing a `.group_by(...).count()`, so the chunk-stage -is just a count. - -Input table: - - - o_orderkey - - o_orderpriority - -Output table: - - - o_orderpriority - - order_count -*/ -rapidsmpf::streaming::Node chunkwise_groupby_agg( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - std::vector partial_results; - std::uint64_t sequence = 0; - while (!ch_out->is_shutdown()) { - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - ctx->comm()->logger().debug("Chunkwise groupby agg: no more input"); - break; - } - co_await ctx->executor()->schedule(); - auto chunk = rapidsmpf::ndsh::to_device( - ctx, msg.release() - ); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); - - - auto grouper = cudf::groupby::groupby( - table.select({0}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_count_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request(table.column(0), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. - std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - - auto result_table = std::make_unique(std::move(result)); - - co_await ch_out->send( - rapidsmpf::streaming::to_message( - sequence++, - std::make_unique( - std::move(result_table), chunk_stream - ) - ) - ); - } - co_await ch_out->drain(ctx->executor()); -} - -/* Group the joined orders table by o_orderpriority (final). - -We're performing a `.group_by(...).count()`, so the final stage -is just a sum. 
- -Input table: - - - o_orderkey - - o_orderpriority - -Output table: - - - o_orderpriority - - order_count -*/ - -rapidsmpf::streaming::Node final_groupby_agg( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out, - rapidsmpf::OpID tag -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - co_await ctx->executor()->schedule(); - auto msg = co_await ch_in->receive(); - auto next = co_await ch_in->receive(); - ctx->comm()->logger().debug("Final groupby"); - RAPIDSMPF_EXPECTS(next.empty(), "Expecting concatenated input at this point"); - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto chunk_stream = chunk.stream(); - auto table = chunk.table_view(); - - std::unique_ptr local_result{nullptr}; - if (!table.is_empty()) { - auto grouper = cudf::groupby::groupby( - table.select({0}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - - requests.push_back( - cudf::groupby::aggregation_request(table.column(1), std::move(aggs)) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - // Drop chunk, we don't need it. - std::ignore = std::move(chunk); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - local_result = std::make_unique(std::move(result)); - } - if (ctx->comm()->nranks() > 1) { - // Reduce across ranks... - // Need a reduce primitive in rapidsmpf, but let's just use an allgather and - // discard for now. - rapidsmpf::streaming::AllGather gatherer{ctx, tag}; - if (local_result) { - auto pack = - cudf::pack(local_result->view(), chunk_stream, ctx->br()->device_mr()); - gatherer.insert( - 0, - {rapidsmpf::PackedData( - std::move(pack.metadata), - ctx->br()->move(std::move(pack.gpu_data), chunk_stream) - )} - ); - } - gatherer.insert_finished(); - auto packed_data = - co_await gatherer.extract_all(rapidsmpf::streaming::AllGather::Ordered::NO); - if (ctx->comm()->rank() == 0) { - auto global_result = rapidsmpf::unpack_and_concat( - rapidsmpf::unspill_partitions( - std::move(packed_data), ctx->br(), true, ctx->statistics() - ), - chunk_stream, - ctx->br(), - ctx->statistics() - ); - if (ctx->comm()->rank() == 0) { - // We will only actually bother to do this on rank zero. - auto result_view = global_result->view(); - auto grouper = cudf::groupby::groupby( - result_view.select({0}), cudf::null_policy::EXCLUDE, cudf::sorted::NO - ); - auto requests = std::vector(); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - requests.push_back( - cudf::groupby::aggregation_request( - result_view.column(1), std::move(aggs) - ) - ); - auto [keys, results] = - grouper.aggregate(requests, chunk_stream, ctx->br()->device_mr()); - global_result.reset(); - auto result = keys->release(); - for (auto&& r : results) { - std::ranges::move(r.results, std::back_inserter(result)); - } - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::make_unique(std::move(result)), chunk_stream - ) - ) - ); - } - } else { - std::ignore = std::move(packed_data); - } - } else { - co_await ch_out->send( - rapidsmpf::streaming::to_message( - 0, - std::make_unique( - std::move(local_result), chunk_stream - ) - ) - ); - } - co_await ch_out->drain(ctx->executor()); -} - -/* Sort the grouped orders table. 
- -Input table: - - - o_orderpriority - - order_count - -Output table: - - - o_orderpriority - - order_count -*/ -rapidsmpf::streaming::Node sort_by( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - co_await ctx->executor()->schedule(); - ctx->comm()->logger().debug("Final sortby"); - auto msg = co_await ch_in->receive(); - // We know we only have a single chunk from the groupby - if (msg.empty()) { - co_return; - } - ctx->comm()->logger().debug("Sortby"); - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto table = chunk.table_view(); - auto result = rapidsmpf::streaming::to_message( - 0, - std::make_unique( - cudf::sort_by_key( - table, - table.select({0, 1}), - {cudf::order::ASCENDING, cudf::order::DESCENDING}, - {cudf::null_order::BEFORE, cudf::null_order::BEFORE}, - chunk.stream(), - ctx->br()->device_mr() - ), - chunk.stream() - ) - ); - co_await ch_out->send(std::move(result)); - co_await ch_out->drain(ctx->executor()); -} - -rapidsmpf::streaming::Node write_parquet( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::string output_path -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in}; - co_await ctx->executor()->schedule(); - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - co_return; - } - ctx->comm()->logger().debug("write parquet"); - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); - auto sink = cudf::io::sink_info(output_path); - auto builder = cudf::io::parquet_writer_options::builder(sink, chunk.table_view()); - auto metadata = cudf::io::table_input_metadata(chunk.table_view()); - metadata.column_metadata[0].set_name("o_orderpriority"); - metadata.column_metadata[1].set_name("order_count"); - builder = builder.metadata(metadata); - auto options = builder.build(); - cudf::io::write_parquet(options, chunk.stream()); - ctx->comm()->logger().print( - "Wrote chunk with ", - chunk.table_view().num_rows(), - " rows and ", - chunk.table_view().num_columns(), - " columns to ", - output_path - ); -} - - } // namespace -struct ProgramOptions { - int num_streaming_threads{1}; - cudf::size_type num_rows_per_chunk{100'000'000}; - std::optional spill_device_limit{std::nullopt}; - bool use_shuffle_join = false; - std::string output_file; - std::string input_directory; - std::uint32_t num_partitions{16}; -}; - -// TODO: Refactor to common utilities -ProgramOptions parse_options(int argc, char** argv) { - ProgramOptions options; - - auto print_usage = [&argv]() { - std::cerr - << "Usage: " << argv[0] << " [options]\n" - << "Options:\n" - << " --num-streaming-threads Number of streaming threads (default: 1)\n" - << " --num-rows-per-chunk Number of rows per chunk (default: " - "100000000)\n" - << " --spill-device-limit Fractional spill device limit (default: " - "None)\n" - << " --use-shuffle-join Use shuffle join (default: false)\n" - << " --output-file Output file path (required)\n" - << " --input-directory Input directory path (required)\n" - << " --num-partitions Number of partitions (default: 16)\n" - << " --help Show this help message\n"; - }; - - static std::array long_options = {{ - {.name = "num-streaming-threads", - .has_arg = required_argument, - .flag = nullptr, - .val = 1}, - {.name = "num-rows-per-chunk", - .has_arg = required_argument, - .flag = nullptr, - .val = 2}, - {.name = "use-shuffle-join", .has_arg = no_argument, .flag = nullptr, .val = 3}, - {.name = "output-file", .has_arg = required_argument, .flag = nullptr, .val = 4}, - {.name = 
"input-directory", - .has_arg = required_argument, - .flag = nullptr, - .val = 5}, - {.name = "num-partitions", - .has_arg = required_argument, - .flag = nullptr, - .val = 6}, - {.name = "help", .has_arg = no_argument, .flag = nullptr, .val = 6}, - {.name = "spill-device-limit", - .has_arg = required_argument, - .flag = nullptr, - .val = 7}, - {.name = nullptr, .has_arg = 0, .flag = nullptr, .val = 0}, - }}; - - int opt; - int option_index = 0; - - bool saw_output_file = false; - bool saw_input_directory = false; - - while ((opt = getopt_long(argc, argv, "", long_options.data(), &option_index)) != -1) - { - switch (opt) { - case 1: - options.num_streaming_threads = std::atoi(optarg); - break; - case 2: - options.num_rows_per_chunk = std::atoi(optarg); - break; - case 3: - options.use_shuffle_join = true; - break; - case 4: - options.output_file = optarg; - saw_output_file = true; - break; - case 5: - options.input_directory = optarg; - saw_input_directory = true; - break; - case 6: - options.num_partitions = static_cast(std::atoi(optarg)); - break; - case 7: - print_usage(); - std::exit(0); - case 8: - options.spill_device_limit = std::stod(optarg); - break; - case '?': - if (optopt == 0 && optind > 1) { - std::cerr << "Error: Unknown option '" << argv[optind - 1] << "'\n\n"; - } - print_usage(); - std::exit(1); - default: - print_usage(); - std::exit(1); - } - } - - // Check if required options were provided - if (!saw_output_file || !saw_input_directory) { - if (!saw_output_file) { - std::cerr << "Error: --output-file is required\n"; - } - if (!saw_input_directory) { - std::cerr << "Error: --input-directory is required\n"; - } - std::cerr << std::endl; - print_usage(); - std::exit(1); - } - - return options; -} - +/** + * @brief Run a derived version of TPCH-query 4. + * + * The SQL form of the query is: + * @code{.sql} + * + * SELECT + * o_orderpriority, + * count(*) as order_count + * FROM + * orders + * where + * o_orderdate >= TIMESTAMP '1993-07-01' + * and o_orderdate < TIMESTAMP '1993-07-01' + INTERVAL '3' MONTH + * and EXISTS ( + * SELECT + * * + * FROM + * lineitem + * WHERE + * l_orderkey = o_orderkey + * and l_commitdate < l_receiptdate + * ) + * GROUP BY + * o_orderpriority + * ORDER BY + * o_orderpriority + * @endcode{} + * + * The "exists" clause is translated into a left-semi join in libcudf. 
+ */ int main(int argc, char** argv) { cudaFree(nullptr); @@ -848,12 +402,6 @@ int main(int argc, char** argv) { auto projected_columns = ctx->create_channel(); // [o_orderpriority, order_count] auto grouped_chunkwise = ctx->create_channel(); - // [o_orderpriority, order_count] - auto grouped_concatenated = ctx->create_channel(); - // [o_orderpriority, order_count] - auto grouped_finalized = ctx->create_channel(); - // [o_orderpriority, order_count] - auto sorted = ctx->create_channel(); nodes.push_back(read_lineitem( ctx, lineitem, 4, arguments.num_rows_per_chunk, arguments.input_directory @@ -922,24 +470,68 @@ int main(int argc, char** argv) { ); nodes.push_back( - chunkwise_groupby_agg(ctx, projected_columns, grouped_chunkwise) - ); - nodes.push_back( - rapidsmpf::ndsh::concatenate( + rapidsmpf::ndsh::chunkwise_group_by( ctx, + projected_columns, grouped_chunkwise, - grouped_concatenated, - rapidsmpf::ndsh::ConcatOrder::DONT_CARE + {0}, + chunkwise_groupby_requests(), + cudf::null_policy::INCLUDE ) ); - nodes.push_back(final_groupby_agg( - ctx, - grouped_concatenated, - grouped_finalized, - rapidsmpf::OpID{static_cast(10 * i + op_id++)} - )); - nodes.push_back(sort_by(ctx, grouped_finalized, sorted)); - nodes.push_back(write_parquet(ctx, sorted, output_path)); + auto final_groupby_input = ctx->create_channel(); + if (ctx->comm()->nranks() > 1) { + nodes.push_back( + rapidsmpf::ndsh::broadcast( + ctx, + grouped_chunkwise, + final_groupby_input, + static_cast(10 * i + op_id++), + rapidsmpf::streaming::AllGather::Ordered::NO + ) + ); + } else { + nodes.push_back( + rapidsmpf::ndsh::concatenate( + ctx, grouped_chunkwise, final_groupby_input + ) + ); + } + if (ctx->comm()->rank() == 0) { + auto final_groupby_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + final_groupby_input, + final_groupby_output, + {0}, + final_groupby_requests(), + cudf::null_policy::INCLUDE + ) + ); + auto sorted_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_sort_by( + ctx, + final_groupby_output, + sorted_output, + {0}, + {0, 1}, + {cudf::order::ASCENDING}, + {cudf::null_order::BEFORE} + ) + ); + nodes.push_back( + rapidsmpf::ndsh::write_parquet( + ctx, + sorted_output, + cudf::io::sink_info(output_path), + {"o_orderpriority", "order_count"} + ) + ); + } else { + nodes.push_back(rapidsmpf::ndsh::sink_channel(ctx, final_groupby_input)); + } } auto end = std::chrono::steady_clock::now(); std::chrono::duration pipeline = end - start; From 16be63bcc4dcac1e340f3c920a77411317717488 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Dec 2025 08:06:25 -0800 Subject: [PATCH 48/75] fuse final groupby agg --- cpp/benchmarks/streaming/ndsh/q04.cpp | 103 +++++++++++++++++++------- 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 0b8076b34..fd1e24e03 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -19,9 +19,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -42,7 +44,6 @@ #include "groupby.hpp" #include "join.hpp" #include "parquet_writer.hpp" -#include "sort.hpp" #include "utils.hpp" namespace { @@ -60,13 +61,78 @@ std::vector chunkwise_groupby_requests() { return requests; } -std::vector final_groupby_requests() { - auto requests = std::vector(); - std::vector()>> aggs; - // sum(count(*)) - 
aggs.emplace_back(cudf::make_sum_aggregation); - requests.emplace_back(1, std::move(aggs)); - return requests; +/* Perform final groupby aggregation and sort. + +Since the cardinality of o_orderpriority is very low, the chunkwise groupby +produces a set of small tables that fits comfortably in memory. We can perform +regular cudf groupby and sort operations instead of streaming versions. + +Input table: + - o_orderpriority + - order_count (partial counts from chunkwise groupby) + +Output table: + - o_orderpriority (sorted ascending) + - order_count (sum of partial counts) +*/ +rapidsmpf::streaming::Node final_groupby_and_sort( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + auto msg = co_await ch_in->receive(); + RAPIDSMPF_EXPECTS( + (co_await ch_in->receive()).empty(), "Expecting concatenated input at this point" + ); + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto stream = chunk.stream(); + auto mr = ctx->br()->device_mr(); + auto table = chunk.table_view(); + + // Perform groupby sum aggregation + auto grouper = cudf::groupby::groupby( + table.select({0}), cudf::null_policy::INCLUDE, cudf::sorted::NO + ); + std::vector> aggs; + aggs.push_back(cudf::make_sum_aggregation()); + std::vector requests; + requests.push_back( + cudf::groupby::aggregation_request{ + .values = table.column(1), .aggregations = std::move(aggs) + } + ); + auto [keys, aggregated] = grouper.aggregate(requests, stream, mr); + + // Build result table + auto result = keys->release(); + for (auto&& a : aggregated) { + std::ranges::move(a.results, std::back_inserter(result)); + } + std::ignore = std::move(chunk); + auto grouped_table = std::make_unique(std::move(result)); + + // Sort by o_orderpriority ascending + auto sorted_table = cudf::sort_by_key( + grouped_table->view(), + grouped_table->view().select({0}), + {cudf::order::ASCENDING}, + {cudf::null_order::BEFORE}, + stream, + mr + ); + + co_await ch_out->send( + rapidsmpf::streaming::to_message( + msg.sequence_number(), + std::make_unique( + std::move(sorted_table), stream + ) + ) + ); + co_await ch_out->drain(ctx->executor()); } /* Select the columns after the join @@ -498,28 +564,9 @@ int main(int argc, char** argv) { ); } if (ctx->comm()->rank() == 0) { - auto final_groupby_output = ctx->create_channel(); - nodes.push_back( - rapidsmpf::ndsh::chunkwise_group_by( - ctx, - final_groupby_input, - final_groupby_output, - {0}, - final_groupby_requests(), - cudf::null_policy::INCLUDE - ) - ); auto sorted_output = ctx->create_channel(); nodes.push_back( - rapidsmpf::ndsh::chunkwise_sort_by( - ctx, - final_groupby_output, - sorted_output, - {0}, - {0, 1}, - {cudf::order::ASCENDING}, - {cudf::null_order::BEFORE} - ) + final_groupby_and_sort(ctx, final_groupby_input, sorted_output) ); nodes.push_back( rapidsmpf::ndsh::write_parquet( From f51ef593275b651d93dceadfea626259ddc5b28c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Dec 2025 12:21:20 -0800 Subject: [PATCH 49/75] revert fuse --- cpp/benchmarks/streaming/ndsh/q04.cpp | 58 ++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index fd1e24e03..eefa64f18 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -61,21 +61,21 @@ std::vector chunkwise_groupby_requests() { return requests; } -/* Perform final 
groupby aggregation and sort. +/* Perform final groupby aggregation. Since the cardinality of o_orderpriority is very low, the chunkwise groupby produces a set of small tables that fits comfortably in memory. We can perform -regular cudf groupby and sort operations instead of streaming versions. +regular cudf groupby operations instead of streaming versions. Input table: - o_orderpriority - order_count (partial counts from chunkwise groupby) Output table: - - o_orderpriority (sorted ascending) + - o_orderpriority - order_count (sum of partial counts) */ -rapidsmpf::streaming::Node final_groupby_and_sort( +rapidsmpf::streaming::Node final_groupby_agg( std::shared_ptr ctx, std::shared_ptr ch_in, std::shared_ptr ch_out @@ -112,12 +112,48 @@ rapidsmpf::streaming::Node final_groupby_and_sort( std::ranges::move(a.results, std::back_inserter(result)); } std::ignore = std::move(chunk); - auto grouped_table = std::make_unique(std::move(result)); - // Sort by o_orderpriority ascending + co_await ch_out->send( + rapidsmpf::streaming::to_message( + msg.sequence_number(), + std::make_unique( + std::make_unique(std::move(result)), stream + ) + ) + ); + co_await ch_out->drain(ctx->executor()); +} + +/* Sort the grouped orders table by o_orderpriority. + +Input table: + - o_orderpriority + - order_count + +Output table: + - o_orderpriority (sorted ascending) + - order_count +*/ +rapidsmpf::streaming::Node sort_by( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + co_return; + } + auto chunk = + rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto stream = chunk.stream(); + auto mr = ctx->br()->device_mr(); + auto table = chunk.table_view(); + auto sorted_table = cudf::sort_by_key( - grouped_table->view(), - grouped_table->view().select({0}), + table, + table.select({0}), {cudf::order::ASCENDING}, {cudf::null_order::BEFORE}, stream, @@ -564,10 +600,12 @@ int main(int argc, char** argv) { ); } if (ctx->comm()->rank() == 0) { - auto sorted_output = ctx->create_channel(); + auto final_groupby_output = ctx->create_channel(); nodes.push_back( - final_groupby_and_sort(ctx, final_groupby_input, sorted_output) + final_groupby_agg(ctx, final_groupby_input, final_groupby_output) ); + auto sorted_output = ctx->create_channel(); + nodes.push_back(sort_by(ctx, final_groupby_output, sorted_output)); nodes.push_back( rapidsmpf::ndsh::write_parquet( ctx, From a3a04138d20ce5ecbaf0946acf96ba24f3ce0b69 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Dec 2025 12:21:31 -0800 Subject: [PATCH 50/75] Add binary --- cpp/benchmarks/streaming/ndsh/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt index 4077174ad..49fa7afd6 100644 --- a/cpp/benchmarks/streaming/ndsh/CMakeLists.txt +++ b/cpp/benchmarks/streaming/ndsh/CMakeLists.txt @@ -41,7 +41,7 @@ target_link_libraries( $ maybe_asan ) -set(RAPIDSMPFNDSH_QUERIES q01 q03 q09) +set(RAPIDSMPFNDSH_QUERIES q01 q03 q04 q09) foreach(query IN ITEMS ${RAPIDSMPFNDSH_QUERIES}) add_executable(${query} "${query}.cpp") From b733c9faf54426942ca2bfe4c62a88373f6800a0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Dec 2025 12:28:23 -0800 Subject: [PATCH 51/75] fixup --- cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/CMakeLists.txt 
b/cpp/CMakeLists.txt index 72c4104bd..52a3b8126 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -125,7 +125,6 @@ include(../cmake/thirdparty/get_nvtx.cmake) include(../cmake/thirdparty/get_rmm.cmake) include(../cmake/thirdparty/get_cuco.cmake) include(../cmake/thirdparty/get_cudf.cmake) -include(../cmake/thirdparty/get_cuco.cmake) if(RAPIDSMPF_HAVE_UCXX) rapids_find_package( ucxx REQUIRED From 7a2780acc22a867170cf33ae684474b63a1168ab Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Dec 2025 12:29:25 -0800 Subject: [PATCH 52/75] fixup --- cpp/include/rapidsmpf/communicator/mpi.hpp | 2 -- cpp/src/communicator/mpi.cpp | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/cpp/include/rapidsmpf/communicator/mpi.hpp b/cpp/include/rapidsmpf/communicator/mpi.hpp index 7cdfb02c3..f403259c3 100644 --- a/cpp/include/rapidsmpf/communicator/mpi.hpp +++ b/cpp/include/rapidsmpf/communicator/mpi.hpp @@ -121,8 +121,6 @@ class MPI final : public Communicator { */ MPI(MPI_Comm comm, config::Options options); - ~MPI() noexcept override; - /** * @copydoc Communicator::rank */ diff --git a/cpp/src/communicator/mpi.cpp b/cpp/src/communicator/mpi.cpp index 9e02f827f..767b9db6d 100644 --- a/cpp/src/communicator/mpi.cpp +++ b/cpp/src/communicator/mpi.cpp @@ -96,10 +96,10 @@ void check_mpi_thread_support() { } } // namespace -MPI::MPI(MPI_Comm comm, config::Options options) : logger_{this, std::move(options)} { +MPI::MPI(MPI_Comm comm, config::Options options) + : comm_{comm}, logger_{this, std::move(options)} { int rank; int nranks; - MPI_Comm_dup(comm, &comm_); RAPIDSMPF_MPI(MPI_Comm_rank(comm_, &rank)); RAPIDSMPF_MPI(MPI_Comm_size(comm_, &nranks)); rank_ = rank; @@ -107,10 +107,6 @@ MPI::MPI(MPI_Comm comm, config::Options options) : logger_{this, std::move(optio check_mpi_thread_support(); } -MPI::~MPI() noexcept { - RAPIDSMPF_MPI(MPI_Comm_free(&comm_)); -} - std::unique_ptr MPI::send( std::unique_ptr> msg, Rank rank, Tag tag ) { From d0130626676b5197315273e0ea546d4fcd7a2e33 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 16 Dec 2025 13:07:04 -0800 Subject: [PATCH 53/75] Use a bloom filter --- cpp/benchmarks/streaming/ndsh/q04.cpp | 113 +++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index eefa64f18..76ed7cdfb 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -4,6 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include #include #include #include @@ -19,7 +20,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -40,6 +43,7 @@ #include #include +#include "bloom_filter.hpp" #include "concatenate.hpp" #include "groupby.hpp" #include "join.hpp" @@ -428,6 +432,70 @@ rapidsmpf::streaming::Node filter_lineitem( co_await ch_out->drain(ctx->executor()); } +[[maybe_unused]] +rapidsmpf::streaming::Node fanout_bounded( + std::shared_ptr ctx, + std::shared_ptr ch_in, + std::shared_ptr ch1_out, + std::vector ch1_cols, + std::shared_ptr ch2_out +) { + rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch1_out, ch2_out}; + + co_await ctx->executor()->schedule(); + while (true) { + auto msg = co_await ch_in->receive(); + if (msg.empty()) { + break; + } + auto chunk = rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); + // Here, we know that copying ch1_cols (a single col) is better than copying + // ch2_cols (the whole table) + std::vector> tasks; + if 
(!ch1_out->is_shutdown()) { + auto msg1 = rapidsmpf::streaming::to_message( + msg.sequence_number(), + std::make_unique( + std::make_unique( + chunk.table_view().select(ch1_cols), + chunk.stream(), + ctx->br()->device_mr() + ), + chunk.stream() + ) + ); + tasks.push_back(ch1_out->send(std::move(msg1))); + } + if (!ch2_out->is_shutdown()) { + // TODO: We know here that ch2 wants the whole table. + tasks.push_back(ch2_out->send( + rapidsmpf::streaming::to_message( + msg.sequence_number(), + std::make_unique(std::move(chunk)) + ) + )); + } + if (!std::ranges::any_of( + rapidsmpf::streaming::coro_results( + co_await coro::when_all(std::move(tasks)) + ), + std::identity{} + )) + { + ctx->comm()->logger().print("Breaking after ", msg.sequence_number()); + break; + }; + } + + rapidsmpf::streaming::coro_results( + co_await coro::when_all( + ch1_out->drain(ctx->executor()), ch2_out->drain(ctx->executor()) + ) + ); +} + } // namespace /** @@ -475,6 +543,13 @@ int main(int argc, char** argv) { std::string output_path = arguments.output_file; std::vector timings; + int l2size; + int device; + RAPIDSMPF_CUDA_TRY(cudaGetDevice(&device)); + RAPIDSMPF_CUDA_TRY(cudaDeviceGetAttribute(&l2size, cudaDevAttrL2CacheSize, device)); + auto const num_filter_blocks = rapidsmpf::ndsh::BloomFilter::fitting_num_blocks( + static_cast(l2size) + ); for (int i = 0; i < arguments.num_iterations; i++) { rapidsmpf::OpID op_id{0}; @@ -515,13 +590,45 @@ int main(int argc, char** argv) { ctx, order, 4, arguments.num_rows_per_chunk, arguments.input_directory )); + // Fanout filtered orders: one for bloom filter, one for join + auto bloom_filter_input = ctx->create_channel(); + auto orders_for_join = ctx->create_channel(); + nodes.push_back( + fanout_bounded(ctx, order, bloom_filter_input, {0}, orders_for_join) + ); + + // Build bloom filter from filtered orders' o_orderkey + auto bloom_filter_output = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::build_bloom_filter( + ctx, + bloom_filter_input, + bloom_filter_output, + static_cast(10 * i + op_id++), + cudf::DEFAULT_HASH_SEED, + num_filter_blocks + ) + ); + + // Apply bloom filter to filtered lineitem before shuffling + auto bloom_filtered_lineitem = ctx->create_channel(); + nodes.push_back( + rapidsmpf::ndsh::apply_bloom_filter( + ctx, + bloom_filter_output, + filtered_lineitem, + bloom_filtered_lineitem, + {0} + ) + ); + // TODO: configurable std::uint32_t num_partitions = 16; nodes.push_back( rapidsmpf::ndsh::shuffle( ctx, - filtered_lineitem, + bloom_filtered_lineitem, filtered_lineitem_shuffled, {0}, num_partitions, @@ -534,7 +641,7 @@ int main(int argc, char** argv) { nodes.push_back( rapidsmpf::ndsh::shuffle( ctx, - order, + orders_for_join, filtered_order_shuffled, {0}, num_partitions, @@ -556,7 +663,7 @@ int main(int argc, char** argv) { nodes.push_back( rapidsmpf::ndsh::left_semi_join_broadcast_left( ctx, - order, + orders_for_join, filtered_lineitem_shuffled, orders_x_lineitem, {0}, From 5152b12bf0243092e3606e525a1a773052bd01ec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 17 Dec 2025 12:58:18 -0800 Subject: [PATCH 54/75] Note on why we shuffle --- cpp/benchmarks/streaming/ndsh/q04.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 76ed7cdfb..c8ad3c08a 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -622,9 +622,14 @@ int main(int argc, char** argv) { ) ); - // TODO: configurable + // 
We unconditionally shuffle the filtered lineitem table. This is + // necessary to correctly handle duplicates in the left-semi join. + // Failing to shuffle (hash partition) the right table on the join + // key could allow a record to match multiple times from the + // multiple partitions of the right table. TODO: configurable + // + // num_partitions std::uint32_t num_partitions = 16; - nodes.push_back( rapidsmpf::ndsh::shuffle( ctx, From ea64499366568c3d083cb21c4a21a46bd973c38f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Dec 2025 06:30:33 -0800 Subject: [PATCH 55/75] Streams, events, joins --- cpp/benchmarks/streaming/ndsh/join.cpp | 38 +++++++++++++++++--------- cpp/benchmarks/streaming/ndsh/q04.cpp | 6 ++-- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index e8a12b932..fa492c717 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -161,26 +161,27 @@ streaming::Message semi_join_chunk( streaming::TableChunk&& right_chunk, [[maybe_unused]] std::vector left_on, std::vector right_on, - std::uint64_t sequence + std::uint64_t sequence, + CudaEvent* left_event ) { - CudaEvent event; + CudaEvent event; // TODO: see if this is needed for deallocation. right_chunk = to_device(ctx, std::move(right_chunk)); + auto chunk_stream = right_chunk.stream(); + + left_event->stream_wait(chunk_stream); + + // At this point, both left_chunk and right_chunk are valid on + // either stream. We'll do everything from here out on the + // right_chunk.stream(), so that we don't introduce false dependencies + // between the different chunks. auto joiner = cudf::filtered_join( right_chunk.table_view().select(right_on), cudf::null_equality::UNEQUAL, cudf::set_as_build_table::RIGHT, - left_chunk.stream() + chunk_stream ); - // We need data to be ready on both left and right sides of the table, - // so the `semi_join` must be on a stream that's downstream of both left and right. - CudaEvent build_event; - build_event.record(left_chunk.stream()); // build_event downstream of left - - auto chunk_stream = right_chunk.stream(); - build_event.stream_wait(chunk_stream); // build_event downstream of right - auto match = joiner.semi_join( left_chunk.table_view().select(left_on), chunk_stream, ctx->br()->device_mr() ); @@ -204,6 +205,8 @@ streaming::Message semi_join_chunk( ctx->comm()->logger().debug( "semi_join_chunk: result_table.num_rows()=", result_table->num_rows() ); + // Deallocation of the join indices will happen on chunk_stream, so add stream dep + cuda_stream_join(left_chunk.stream(), chunk_stream, &event); return streaming::to_message( sequence, std::make_unique(std::move(result_table), chunk_stream) @@ -454,6 +457,8 @@ streaming::Node left_semi_join_broadcast_left( ctx->comm()->logger().print( "Left (probe) table has ", left_table.table_view().num_rows(), " rows" ); + CudaEvent left_event; + left_event.record(left_table.stream()); std::size_t sequence = 0; while (true) { @@ -467,7 +472,8 @@ streaming::Node left_semi_join_broadcast_left( right_msg.release(), left_on, right_on, - sequence++ + sequence++, + &left_event )); } @@ -491,6 +497,11 @@ streaming::Node left_semi_join_shuffle( // Requirement: two shuffles kick out partitions in the same order auto left_msg = co_await left->receive(); auto right_msg = co_await right->receive(); + + // We don't have any dependencies across chunks, so make an event per chunk pair. 
+ CudaEvent left_event; + left_event.record(left_msg.release().stream()); + if (left_msg.empty()) { RAPIDSMPF_EXPECTS( right_msg.empty(), "Left does not have same number of partitions as right" @@ -508,7 +519,8 @@ streaming::Node left_semi_join_shuffle( right_msg.release(), left_on, right_on, - left_msg.sequence_number() + left_msg.sequence_number(), + &left_event )); } } diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index c8ad3c08a..aeb2198cf 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -626,9 +626,9 @@ int main(int argc, char** argv) { // necessary to correctly handle duplicates in the left-semi join. // Failing to shuffle (hash partition) the right table on the join // key could allow a record to match multiple times from the - // multiple partitions of the right table. TODO: configurable - // - // num_partitions + // multiple partitions of the right table. + + // TODO: configurable num_partitions std::uint32_t num_partitions = 16; nodes.push_back( rapidsmpf::ndsh::shuffle( From 8098cd290b816d5eb6ea664b704bbaae2e689fd3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 14 Jan 2026 06:01:17 -0800 Subject: [PATCH 56/75] lint --- cpp/benchmarks/streaming/ndsh/join.hpp | 2 +- cpp/benchmarks/streaming/ndsh/q04.cpp | 2 +- cpp/include/rapidsmpf/communicator/mpi.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.hpp b/cpp/benchmarks/streaming/ndsh/join.hpp index de91ca77c..3dea086e5 100644 --- a/cpp/benchmarks/streaming/ndsh/join.hpp +++ b/cpp/benchmarks/streaming/ndsh/join.hpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index aeb2198cf..0d212e9c8 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -1,6 +1,6 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ diff --git a/cpp/include/rapidsmpf/communicator/mpi.hpp b/cpp/include/rapidsmpf/communicator/mpi.hpp index f403259c3..c63c07f42 100644 --- a/cpp/include/rapidsmpf/communicator/mpi.hpp +++ b/cpp/include/rapidsmpf/communicator/mpi.hpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once From 6ff31f914a570f1e2e295ab58cfc277121236fde Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Jan 2026 08:25:16 -0800 Subject: [PATCH 57/75] Compiling again --- cpp/benchmarks/streaming/ndsh/join.cpp | 28 ++++++++++++++++---------- cpp/benchmarks/streaming/ndsh/q04.cpp | 16 ++++++++------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 0fcac82e2..0e778785d 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -161,13 +161,12 @@ streaming::Message semi_join_chunk( std::shared_ptr ctx, streaming::TableChunk const& left_chunk, streaming::TableChunk&& right_chunk, - [[maybe_unused]] std::vector left_on, + std::vector left_on, std::vector right_on, std::uint64_t sequence, CudaEvent* left_event ) { - CudaEvent event; // TODO: see if this is needed for deallocation. - right_chunk = to_device(ctx, std::move(right_chunk)); + CudaEvent event; auto chunk_stream = right_chunk.stream(); left_event->stream_wait(chunk_stream); @@ -452,7 +451,7 @@ streaming::Node left_semi_join_broadcast_left( streaming::ShutdownAtExit c{left, right, ch_out}; co_await ctx->executor()->schedule(); ctx->comm()->logger().print("Inner broadcast join ", static_cast(tag)); - auto left_table = to_device( + auto left_table = co_await to_device( ctx, (co_await broadcast(ctx, left, tag)).release() ); ctx->comm()->logger().print( @@ -467,10 +466,12 @@ streaming::Node left_semi_join_broadcast_left( if (right_msg.empty()) { break; } + auto right_chunk = + co_await to_device(ctx, right_msg.release()); co_await ch_out->send(semi_join_chunk( ctx, left_table, - right_msg.release(), + std::move(right_chunk), left_on, right_on, sequence++, @@ -499,10 +500,6 @@ streaming::Node left_semi_join_shuffle( auto left_msg = co_await left->receive(); auto right_msg = co_await right->receive(); - // We don't have any dependencies across chunks, so make an event per chunk pair. - CudaEvent left_event; - left_event.record(left_msg.release().stream()); - if (left_msg.empty()) { RAPIDSMPF_EXPECTS( right_msg.empty(), "Left does not have same number of partitions as right" @@ -514,10 +511,19 @@ streaming::Node left_semi_join_shuffle( "Mismatching sequence numbers" ); + auto left_chunk = + co_await to_device(ctx, left_msg.release()); + auto right_chunk = + co_await to_device(ctx, right_msg.release()); + + // We don't have any dependencies across chunks, so make an event per chunk pair. 
+ CudaEvent left_event; + left_event.record(left_chunk.stream()); + co_await ch_out->send(semi_join_chunk( ctx, - left_msg.release(), - right_msg.release(), + left_chunk, + std::move(right_chunk), left_on, right_on, left_msg.sequence_number(), diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 0d212e9c8..2d032e964 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -90,8 +90,9 @@ rapidsmpf::streaming::Node final_groupby_agg( RAPIDSMPF_EXPECTS( (co_await ch_in->receive()).empty(), "Expecting concatenated input at this point" ); - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto chunk = co_await rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); auto stream = chunk.stream(); auto mr = ctx->br()->device_mr(); auto table = chunk.table_view(); @@ -149,8 +150,9 @@ rapidsmpf::streaming::Node sort_by( if (msg.empty()) { co_return; } - auto chunk = - rapidsmpf::ndsh::to_device(ctx, msg.release()); + auto chunk = co_await rapidsmpf::ndsh::to_device( + ctx, msg.release() + ); auto stream = chunk.stream(); auto mr = ctx->br()->device_mr(); auto table = chunk.table_view(); @@ -198,7 +200,7 @@ rapidsmpf::streaming::Node select_columns( break; } co_await ctx->executor()->schedule(); - auto chunk = rapidsmpf::ndsh::to_device( + auto chunk = co_await rapidsmpf::ndsh::to_device( ctx, msg.release() ); auto chunk_stream = chunk.stream(); @@ -402,7 +404,7 @@ rapidsmpf::streaming::Node filter_lineitem( break; } co_await ctx->executor()->schedule(); - auto chunk = rapidsmpf::ndsh::to_device( + auto chunk = co_await rapidsmpf::ndsh::to_device( ctx, msg.release() ); auto chunk_stream = chunk.stream(); @@ -448,7 +450,7 @@ rapidsmpf::streaming::Node fanout_bounded( if (msg.empty()) { break; } - auto chunk = rapidsmpf::ndsh::to_device( + auto chunk = co_await rapidsmpf::ndsh::to_device( ctx, msg.release() ); // Here, we know that copying ch1_cols (a single col) is better than copying From 05782b3419f3cd8cbe17158a7fc926de298ea65e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Jan 2026 08:36:13 -0800 Subject: [PATCH 58/75] remove duplicate log --- cpp/benchmarks/streaming/ndsh/join.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 0e778785d..44a68dfd7 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -203,9 +203,6 @@ streaming::Message semi_join_chunk( ->release(); auto result_table = std::make_unique(std::move(result_columns)); - ctx->comm()->logger().debug( - "semi_join_chunk: result_table.num_rows()=", result_table->num_rows() - ); // Deallocation of the join indices will happen on chunk_stream, so add stream dep cuda_stream_join(left_chunk.stream(), chunk_stream, &event); return streaming::to_message( From 4ae399c538d1aa72d799df23b2827d7c3675d6de Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Jan 2026 08:37:44 -0800 Subject: [PATCH 59/75] remove unused event --- cpp/benchmarks/streaming/ndsh/join.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 44a68dfd7..3c8c2c66f 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -166,7 +166,6 @@ streaming::Message semi_join_chunk( std::uint64_t sequence, CudaEvent* left_event ) { - CudaEvent event; auto chunk_stream = right_chunk.stream(); 
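+    // Make the right chunk's stream wait on the event recorded on the left
+    // chunk's stream, so the left (probe) table's device data is valid before
+    // the join below reads it on chunk_stream.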
left_event->stream_wait(chunk_stream); @@ -204,7 +203,8 @@ streaming::Message semi_join_chunk( auto result_table = std::make_unique(std::move(result_columns)); // Deallocation of the join indices will happen on chunk_stream, so add stream dep - cuda_stream_join(left_chunk.stream(), chunk_stream, &event); + cuda_stream_join(left_chunk.stream(), chunk_stream); + return streaming::to_message( sequence, std::make_unique(std::move(result_table), chunk_stream) From 3bea0dff601898df20c9b3d283f9b15d403d441e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Jan 2026 08:38:25 -0800 Subject: [PATCH 60/75] fix while condition --- cpp/benchmarks/streaming/ndsh/join.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 3c8c2c66f..bd34effaf 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -458,7 +458,7 @@ streaming::Node left_semi_join_broadcast_left( left_event.record(left_table.stream()); std::size_t sequence = 0; - while (true) { + while (!ch_out->is_shutdown()) { auto right_msg = co_await right->receive(); if (right_msg.empty()) { break; From 672af5d7eb0c2c51c6ca47321c107112d9dd23bc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Jan 2026 08:46:41 -0800 Subject: [PATCH 61/75] fixes --- cpp/benchmarks/streaming/ndsh/join.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index bd34effaf..3340969c4 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -457,7 +457,6 @@ streaming::Node left_semi_join_broadcast_left( CudaEvent left_event; left_event.record(left_table.stream()); - std::size_t sequence = 0; while (!ch_out->is_shutdown()) { auto right_msg = co_await right->receive(); if (right_msg.empty()) { @@ -471,7 +470,7 @@ streaming::Node left_semi_join_broadcast_left( std::move(right_chunk), left_on, right_on, - sequence++, + right_msg.sequence_number(), &left_event )); } @@ -491,6 +490,7 @@ streaming::Node left_semi_join_shuffle( ctx->comm()->logger().print("Shuffle left semi join"); co_await ctx->executor()->schedule(); + CudaEvent left_event; while (true) { // Requirement: two shuffles kick out partitions in the same order @@ -513,8 +513,6 @@ streaming::Node left_semi_join_shuffle( auto right_chunk = co_await to_device(ctx, right_msg.release()); - // We don't have any dependencies across chunks, so make an event per chunk pair. 
- CudaEvent left_event; left_event.record(left_chunk.stream()); co_await ch_out->send(semi_join_chunk( @@ -544,7 +542,7 @@ streaming::Node shuffle( while (true) { auto msg = co_await ch_in->receive(); if (msg.empty()) { - ctx->comm()->logger().print("Shuffle: no more input"); + ctx->comm()->logger().debug("Shuffle: no more input"); break; } auto chunk = co_await to_device(ctx, msg.release()); From 9c8a21a2daa436e6c8f8fb2ef614911cd62e1c89 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Jan 2026 08:51:45 -0800 Subject: [PATCH 62/75] revert MPI change --- cpp/include/rapidsmpf/communicator/mpi.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/include/rapidsmpf/communicator/mpi.hpp b/cpp/include/rapidsmpf/communicator/mpi.hpp index 0d9e38fee..8a0830ea9 100644 --- a/cpp/include/rapidsmpf/communicator/mpi.hpp +++ b/cpp/include/rapidsmpf/communicator/mpi.hpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -121,6 +121,8 @@ class MPI final : public Communicator { */ MPI(MPI_Comm comm, config::Options options); + ~MPI() noexcept override = default; + /** * @copydoc Communicator::rank */ From 7c16d6733e4202889ae3aab3230e74e661dca502 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Jan 2026 08:57:17 -0800 Subject: [PATCH 63/75] docstring fixes --- cpp/benchmarks/streaming/ndsh/join.cpp | 23 ----------------------- cpp/benchmarks/streaming/ndsh/join.hpp | 5 +++-- 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 3340969c4..2191462b6 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -412,29 +412,6 @@ streaming::Node inner_join_shuffle( co_await ch_out->drain(ctx->executor()); } -/** - * @brief Perform a left semi join between two tables, broadcasting the left table to all - * ranks. - * - * @param ctx Streaming context - * @param left Channel of `TableChunk`s used as the broadcasted build side. This table is - * broadcasted to all ranks. - * @param right Channel of `TableChunk`s joined in turn against the build side. This table - * is required to be shuffled / hash-partitioned. - * @param ch_out Output channel of `TableChunk`s. - * @param left_on Column indices of the keys in the left table. - * @param right_on Column indices of the keys in the right table. - * @param tag Disambiguating tag for the broadcast of the left table. - * @param keep_keys Does the result contain the key columns, or only "carrier" value - * columns - * @return Coroutine representing the completion of the join. - * - * @note This implementation assumes that: - * - `left` is small and fits in memory - * - `right` is shuffled / hash-partitioned - * It doesn't implement build table reuse across chunks of `left`, because we assume that - * `right` is too large to broadcast. 
- */ streaming::Node left_semi_join_broadcast_left( std::shared_ptr ctx, std::shared_ptr left, diff --git a/cpp/benchmarks/streaming/ndsh/join.hpp b/cpp/benchmarks/streaming/ndsh/join.hpp index 30fc86bc3..07931926e 100644 --- a/cpp/benchmarks/streaming/ndsh/join.hpp +++ b/cpp/benchmarks/streaming/ndsh/join.hpp @@ -126,10 +126,11 @@ enum class KeepKeys : bool { * @note This performs a broadcast join, broadcasting the table represented by the `left` * channel to all ranks, and then streaming through the chunks of the `right` channel. * The `right` channel is required to provide hash-partitioned data in-order. + * All of the chunks from the `left` channel must fit in memory at once. * * @param ctx Streaming context. - * @param left Channel of `TableChunk`s in hash-partitioned order. - * @param right Channel of `TableChunk`s in matching hash-partitioned order. + * @param left Channel of `TableChunk`s. + * @param right Channel of `TableChunk`s in hash-partitioned order (shuffled). * @param ch_out Output channel of `TableChunk`s. * @param left_on Column indices of the keys in the left table. * @param right_on Column indices of the keys in the right table. From 372851ac6f121567d3d1064356612a20164ed200 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 05:57:24 -0800 Subject: [PATCH 64/75] to_device compat --- cpp/benchmarks/streaming/ndsh/join.cpp | 12 +++++------ cpp/benchmarks/streaming/ndsh/q04.cpp | 29 ++++++++++++-------------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index b7b2ca70f..3216281e1 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -425,9 +425,9 @@ streaming::Node left_semi_join_broadcast_left( streaming::ShutdownAtExit c{left, right, ch_out}; co_await ctx->executor()->schedule(); ctx->comm()->logger().print("Inner broadcast join ", static_cast(tag)); - auto left_table = co_await to_device( - ctx, (co_await broadcast(ctx, left, tag)).release() - ); + auto left_table = co_await (co_await broadcast(ctx, left, tag)) + .release() + .make_available(ctx); ctx->comm()->logger().print( "Left (probe) table has ", left_table.table_view().num_rows(), " rows" ); @@ -440,7 +440,7 @@ streaming::Node left_semi_join_broadcast_left( break; } auto right_chunk = - co_await to_device(ctx, right_msg.release()); + co_await right_msg.release().make_available(ctx); co_await ch_out->send(semi_join_chunk( ctx, left_table, @@ -486,9 +486,9 @@ streaming::Node left_semi_join_shuffle( ); auto left_chunk = - co_await to_device(ctx, left_msg.release()); + co_await left_msg.release().make_available(ctx); auto right_chunk = - co_await to_device(ctx, right_msg.release()); + co_await right_msg.release().make_available(ctx); left_event.record(left_chunk.stream()); diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 2d032e964..cb9a47f4a 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -90,9 +90,8 @@ rapidsmpf::streaming::Node final_groupby_agg( RAPIDSMPF_EXPECTS( (co_await ch_in->receive()).empty(), "Expecting concatenated input at this point" ); - auto chunk = co_await rapidsmpf::ndsh::to_device( - ctx, msg.release() - ); + auto chunk = + co_await msg.release().make_available(ctx); auto stream = chunk.stream(); auto mr = ctx->br()->device_mr(); auto table = chunk.table_view(); @@ -150,9 +149,8 @@ rapidsmpf::streaming::Node sort_by( if (msg.empty()) { 
co_return;
         }
-        auto chunk = co_await rapidsmpf::ndsh::to_device(
-            ctx, msg.release()
-        );
+        auto chunk =
+            co_await msg.release().make_available(ctx);
         auto stream = chunk.stream();
         auto mr = ctx->br()->device_mr();
         auto table = chunk.table_view();
@@ -200,9 +198,8 @@ rapidsmpf::streaming::Node select_columns(
             break;
         }
         co_await ctx->executor()->schedule();
-        auto chunk = co_await rapidsmpf::ndsh::to_device(
-            ctx, msg.release()
-        );
+        auto chunk =
+            co_await msg.release().make_available(ctx);
         auto chunk_stream = chunk.stream();
         auto sequence_number = msg.sequence_number();
         auto table = chunk.table_view();
@@ -404,9 +401,8 @@ rapidsmpf::streaming::Node filter_lineitem(
             break;
         }
         co_await ctx->executor()->schedule();
-        auto chunk = co_await rapidsmpf::ndsh::to_device(
-            ctx, msg.release()
-        );
+        auto chunk =
+            co_await msg.release().make_available(ctx);
         auto chunk_stream = chunk.stream();
         auto table = chunk.table_view();
@@ -450,10 +446,9 @@ rapidsmpf::streaming::Node fanout_bounded(
         if (msg.empty()) {
            break;
         }
-        auto chunk = co_await rapidsmpf::ndsh::to_device(
-            ctx, msg.release()
-        );
+        auto chunk =
+            co_await msg.release().make_available(ctx);
         // Here, we know that copying ch1_cols (a single col) is better than copying
             // ch2_cols (the whole table)
         std::vector> tasks;
         if (!ch1_out->is_shutdown()) {

From 202b7f7a328fec569de057082041a00f0c2d4ce0 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 30 Jan 2026 07:04:38 -0800
Subject: [PATCH 65/75] clarify hash-partitioning

---
 cpp/benchmarks/streaming/ndsh/join.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp
index 3216281e1..19862755c 100644
--- a/cpp/benchmarks/streaming/ndsh/join.cpp
+++ b/cpp/benchmarks/streaming/ndsh/join.cpp
@@ -439,6 +439,10 @@ streaming::Node left_semi_join_broadcast_left(
         if (right_msg.empty()) {
             break;
         }
+        // The ``right`` table has been hash-partitioned (via a shuffle) on
+        // the join key. Thanks to the hash-partitioning, we don't need to worry
+        // about deduplicating matches across partitions. Anything that matches
+        // in the semi-join belongs in the output.
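+        // (Concretely: the shuffle routes every right-side row with a given
+        // key value to a single partition, so a left row cannot match in two
+        // different partitions.)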
auto right_chunk = co_await right_msg.release().make_available(ctx); co_await ch_out->send(semi_join_chunk( From 206337cb4ab34fdfc8ca47365301abd387f6da73 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 07:21:40 -0800 Subject: [PATCH 66/75] add run-and-validate --- cpp/scripts/ndsh.py | 111 ++++++++++++++++++++++++++++---------------- 1 file changed, 72 insertions(+), 39 deletions(-) diff --git a/cpp/scripts/ndsh.py b/cpp/scripts/ndsh.py index 400d1757a..e021f3ae1 100755 --- a/cpp/scripts/ndsh.py +++ b/cpp/scripts/ndsh.py @@ -595,6 +595,21 @@ def cmd_run(args: argparse.Namespace) -> int: return int(failed > 0) +def cmd_run_and_validate(args: argparse.Namespace) -> int: + """Execute the 'run-and-validate' subcommand.""" + # First run the benchmarks + run_result = cmd_run(args) + if run_result != 0: + print("\nRun phase failed, skipping validation.") + return run_result + + # Set up paths for validation based on run output + args.results_path = args.output_dir / "output" + args.expected_path = args.output_dir / "expected" + + return cmd_validate(args) + + def cmd_validate(args: argparse.Namespace) -> int: """Execute the 'validate' subcommand.""" if not args.results_path.exists(): @@ -681,13 +696,9 @@ def main(): subparsers = parser.add_subparsers(dest="command", required=True) - # 'run' subcommand - run_parser = subparsers.add_parser( - "run", - help="Run benchmarks and generate expected results", - description="Run C++ benchmark binaries and generate expected results via DuckDB.", - ) - run_parser.add_argument( + # Parent parser for run-related arguments + run_parent = argparse.ArgumentParser(add_help=False) + run_parent.add_argument( "--benchmark-dir", type=Path, help="Directory containing benchmark binaries (q04, q09, etc.)", @@ -695,7 +706,7 @@ def main(): "cpp/build/benchmarks/ndsh" ), ) - run_parser.add_argument( + run_parent.add_argument( "--sql-dir", type=Path, help="Directory containing SQL query files (q04.sql, q09.sql, etc.)", @@ -703,104 +714,126 @@ def main(): "cpp/benchmarks/streaming/ndsh/sql" ), ) - run_parser.add_argument( + run_parent.add_argument( "--input-dir", type=Path, required=True, help="Directory containing TPC-H input parquet files", ) - run_parser.add_argument( + run_parent.add_argument( "--output-dir", type=Path, required=True, help="Directory for output files", ) - parser.add_argument( + run_parent.add_argument( "-q", "--queries", help="Comma-separated list of SQL query numbers to run or the string 'all'", type=query_type, default="all", ) - run_parser.add_argument( + run_parent.add_argument( "--benchmark-args", type=str, default="", help="Additional arguments to pass to benchmark binaries (space-separated)", ) - run_parser.add_argument( + run_parent.add_argument( "--reuse-expected", action="store_true", help="Skip generating expected results if the expected file already exists", ) - run_parser.add_argument( + run_parent.add_argument( "--reuse-output", action="store_true", help="Skip running the benchmark if the output file already exists", ) - run_parser.add_argument( + run_parent.add_argument( "--generate-data", action="store_true", help="Generate data for the benchmarks", ) - # 'validate' subcommand - validate_parser = subparsers.add_parser( - "validate", - help="Compare results against expected", - description="Validate benchmark results by comparing parquet files against expected results.", - ) - validate_parser.add_argument( - "--results-path", - type=Path, - required=True, - help="Directory containing benchmark result parquet files 
(qDD.parquet)", - ) - validate_parser.add_argument( - "--expected-path", - type=Path, - required=True, - help="Directory containing expected parquet files (qDD.parquet)", - ) - validate_parser.add_argument( + # Parent parser for validation comparison options (not the paths) + validate_options_parent = argparse.ArgumentParser(add_help=False) + validate_options_parent.add_argument( "-d", "--decimal", type=int, default=2, help="Number of decimal places to compare for floating point values (default: 2)", ) - validate_parser.add_argument( + validate_options_parent.add_argument( "--ignore-timezone", action="store_true", help="Ignore differences in timezone and precision for timestamp types", ) - validate_parser.add_argument( + validate_options_parent.add_argument( "--ignore-string-type", action="store_true", help="Ignore differences between string and large_string types", ) - validate_parser.add_argument( + validate_options_parent.add_argument( "--ignore-integer-sign", action="store_true", help="Ignore differences between signed and unsigned integer types", ) - validate_parser.add_argument( + validate_options_parent.add_argument( "--ignore-integer-size", action="store_true", help="Ignore differences in integer bit width (e.g., int32 vs int64)", ) - validate_parser.add_argument( + validate_options_parent.add_argument( "--ignore-decimal-int", action="store_true", help="Ignore differences between decimal and integer types", ) + # 'run' subcommand - inherits from run_parent + subparsers.add_parser( + "run", + parents=[run_parent], + help="Run benchmarks and generate expected results", + description="Run C++ benchmark binaries and generate expected results via DuckDB.", + ) + + # 'validate' subcommand - inherits comparison options, adds its own paths + validate_parser = subparsers.add_parser( + "validate", + parents=[validate_options_parent], + help="Compare results against expected", + description="Validate benchmark results by comparing parquet files against expected results.", + ) + validate_parser.add_argument( + "--results-path", + type=Path, + required=True, + help="Directory containing benchmark result parquet files (qDD.parquet)", + ) + validate_parser.add_argument( + "--expected-path", + type=Path, + required=True, + help="Directory containing expected parquet files (qDD.parquet)", + ) + + # 'run-and-validate' subcommand - inherits from BOTH parents + subparsers.add_parser( + "run-and-validate", + parents=[run_parent, validate_options_parent], + help="Run benchmarks and validate results in one step", + description="Run C++ benchmark binaries, generate expected results via DuckDB, and validate.", + ) + args = parser.parse_args() if args.command == "run": sys.exit(cmd_run(args)) elif args.command == "validate": sys.exit(cmd_validate(args)) + elif args.command == "run-and-validate": + sys.exit(cmd_run_and_validate(args)) if __name__ == "__main__": From 17ed006b04d7986f384c59c0420d59bb71bb640d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 07:22:21 -0800 Subject: [PATCH 67/75] reuse chunkwise_group_by --- cpp/benchmarks/streaming/ndsh/q04.cpp | 79 ++++++--------------------- 1 file changed, 17 insertions(+), 62 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index cb9a47f4a..5644390ad 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -65,67 +65,15 @@ std::vector chunkwise_groupby_requests() { return requests; } -/* Perform final groupby aggregation. 
- -Since the cardinality of o_orderpriority is very low, the chunkwise groupby -produces a set of small tables that fits comfortably in memory. We can perform -regular cudf groupby operations instead of streaming versions. - -Input table: - - o_orderpriority - - order_count (partial counts from chunkwise groupby) - -Output table: - - o_orderpriority - - order_count (sum of partial counts) -*/ -rapidsmpf::streaming::Node final_groupby_agg( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - co_await ctx->executor()->schedule(); - auto msg = co_await ch_in->receive(); - RAPIDSMPF_EXPECTS( - (co_await ch_in->receive()).empty(), "Expecting concatenated input at this point" - ); - auto chunk = - co_await msg.release().make_available(ctx); - auto stream = chunk.stream(); - auto mr = ctx->br()->device_mr(); - auto table = chunk.table_view(); - - // Perform groupby sum aggregation - auto grouper = cudf::groupby::groupby( - table.select({0}), cudf::null_policy::INCLUDE, cudf::sorted::NO - ); - std::vector> aggs; - aggs.push_back(cudf::make_sum_aggregation()); - std::vector requests; - requests.push_back( - cudf::groupby::aggregation_request{ - .values = table.column(1), .aggregations = std::move(aggs) - } - ); - auto [keys, aggregated] = grouper.aggregate(requests, stream, mr); - - // Build result table - auto result = keys->release(); - for (auto&& a : aggregated) { - std::ranges::move(a.results, std::back_inserter(result)); - } - std::ignore = std::move(chunk); - - co_await ch_out->send( - rapidsmpf::streaming::to_message( - msg.sequence_number(), - std::make_unique( - std::make_unique(std::move(result)), stream - ) - ) - ); - co_await ch_out->drain(ctx->executor()); +std::vector final_groupby_requests() { + auto requests = std::vector(); + std::vector()>> aggs; + // sum of partial counts + aggs.emplace_back([]() { + return cudf::make_sum_aggregation(); + }); + requests.emplace_back(1, std::move(aggs)); // column 1 is order_count + return requests; } /* Sort the grouped orders table by o_orderpriority. 
@@ -713,7 +661,14 @@ int main(int argc, char** argv) { if (ctx->comm()->rank() == 0) { auto final_groupby_output = ctx->create_channel(); nodes.push_back( - final_groupby_agg(ctx, final_groupby_input, final_groupby_output) + rapidsmpf::ndsh::chunkwise_group_by( + ctx, + final_groupby_input, + final_groupby_output, + {0}, + final_groupby_requests(), + cudf::null_policy::INCLUDE + ) ); auto sorted_output = ctx->create_channel(); nodes.push_back(sort_by(ctx, final_groupby_output, sorted_output)); From a9613e26f60cce120b563c4bc8bbe0349f953b29 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 07:43:58 -0800 Subject: [PATCH 68/75] fixed queries parsing --- cpp/scripts/ndsh.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cpp/scripts/ndsh.py b/cpp/scripts/ndsh.py index e021f3ae1..ced13cb10 100755 --- a/cpp/scripts/ndsh.py +++ b/cpp/scripts/ndsh.py @@ -630,6 +630,17 @@ def cmd_validate(args: argparse.Namespace) -> int: print(f"No qDD.parquet files found in results directory: {args.results_path}") return 1 + # Filter to specific queries if requested + if args.queries: + results_files = { + name: path + for name, path in results_files.items() + if int(name.lstrip("q")) in args.queries + } + if not results_files: + print(f"No matching result files found for queries: {args.queries}") + return 1 + print(f"\nValidating {len(results_files)} query(ies):") # Validate each matching pair @@ -817,6 +828,13 @@ def main(): required=True, help="Directory containing expected parquet files (qDD.parquet)", ) + validate_parser.add_argument( + "-q", + "--queries", + help="Comma-separated list of SQL query numbers to validate or the string 'all'", + type=query_type, + default="all", + ) # 'run-and-validate' subcommand - inherits from BOTH parents subparsers.add_parser( From 7a135e24835325fe4d8f61f563fc9dc92122460f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 07:45:35 -0800 Subject: [PATCH 69/75] Handle dates --- cpp/benchmarks/streaming/ndsh/q04.cpp | 147 +++++------------------- cpp/benchmarks/streaming/ndsh/utils.hpp | 95 +++++++++++++++ 2 files changed, 126 insertions(+), 116 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 5644390ad..4afc5d300 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -25,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -204,7 +202,8 @@ rapidsmpf::streaming::Node read_orders( std::shared_ptr ch_out, std::size_t num_producers, cudf::size_type num_rows_per_chunk, - std::string const& input_directory + std::string const& input_directory, + bool use_date32 ) { auto files = rapidsmpf::ndsh::detail::list_parquet_files( rapidsmpf::ndsh::detail::get_table_path(input_directory, "orders") @@ -216,120 +215,25 @@ rapidsmpf::streaming::Node read_orders( }) .build(); - // Build the filter expression 1993-07-01 <= o_orderdate < 1993-10-01 - cudf::timestamp_ms ts1{ - cuda::std::chrono::duration_cast( - cuda::std::chrono::sys_days( - cuda::std::chrono::year_month_day( - cuda::std::chrono::year(1993), - cuda::std::chrono::month(7), - cuda::std::chrono::day(1) - ) - ) - .time_since_epoch() - ) - }; - cudf::timestamp_ms ts2{ - cuda::std::chrono::duration_cast( - cuda::std::chrono::sys_days( - cuda::std::chrono::year_month_day( - cuda::std::chrono::year(1993), - cuda::std::chrono::month(10), - cuda::std::chrono::day(1) - ) - ) - 
.time_since_epoch() - ) - }; - - /* This vector will have the references for the expression `a < column < b` as - - 0: column_reference to o_orderdate - 1: scalar - 2: scalar - 3: literal - 4: literal - 5: operation GE - 6: operation LT - 7: operation AND - */ - - auto owner = new std::vector; - auto filter_stream = ctx->br()->stream_pool().get_stream(); - // 0 - owner->push_back( - std::make_shared( - "o_orderdate" - ) // position in the table - ); - - - // 1, 2: Scalars - owner->push_back( - std::make_shared>( - ts1, true, filter_stream - ) + auto stream = ctx->br()->stream_pool().get_stream(); + // 1993-07-01 <= o_orderdate < 1993-10-01 + constexpr auto start_date = cuda::std::chrono::year_month_day( + cuda::std::chrono::year(1993), + cuda::std::chrono::month(7), + cuda::std::chrono::day(1) ); - owner->push_back( - std::make_shared>( - ts2, true, filter_stream - ) - ); - - // 3, 4: Literals - owner->push_back( - std::make_shared( - *std::any_cast>>( - owner->at(1) - ) - ) - ); - owner->push_back( - std::make_shared( - *std::any_cast>>( - owner->at(2) - ) - ) - ); - - // 5: (GE, column, literal) - owner->push_back( - std::make_shared( - cudf::ast::ast_operator::GREATER_EQUAL, - *std::any_cast>( - owner->at(0) - ), - *std::any_cast>(owner->at(3)) - ) - ); - - // 6 (LT, column, literal) - owner->push_back( - std::make_shared( - cudf::ast::ast_operator::LESS, - *std::any_cast>( - owner->at(0) - ), - *std::any_cast>(owner->at(4)) - ) - ); - - // 7 (AND, GE, LT) - owner->push_back( - std::make_shared( - cudf::ast::ast_operator::LOGICAL_AND, - *std::any_cast>(owner->at(5)), - *std::any_cast>(owner->at(6)) - ) - ); - - auto filter = std::make_unique( - filter_stream, - *std::any_cast>(owner->back()), - rapidsmpf::OwningWrapper(static_cast(owner), [](void* p) { - delete static_cast*>(p); - }) + constexpr auto end_date = cuda::std::chrono::year_month_day( + cuda::std::chrono::year(1993), + cuda::std::chrono::month(10), + cuda::std::chrono::day(1) ); + auto filter = use_date32 + ? rapidsmpf::ndsh::make_date_range_filter( + stream, start_date, end_date, "o_orderdate" + ) + : rapidsmpf::ndsh::make_date_range_filter( + stream, start_date, end_date, "o_orderdate" + ); return rapidsmpf::streaming::node::read_parquet( ctx, ch_out, num_producers, options, num_rows_per_chunk, std::move(filter) @@ -490,6 +394,12 @@ int main(int argc, char** argv) { std::string output_path = arguments.output_file; std::vector timings; + // Detect date column types from parquet metadata before timed section + auto const orders_types = + rapidsmpf::ndsh::detail::get_column_types(arguments.input_directory, "orders"); + bool const orders_use_date32 = + orders_types.at("o_orderdate").id() == cudf::type_id::TIMESTAMP_DAYS; + int l2size; int device; RAPIDSMPF_CUDA_TRY(cudaGetDevice(&device)); @@ -534,7 +444,12 @@ int main(int argc, char** argv) { filter_lineitem(ctx, lineitem, filtered_lineitem) ); // l_orderkey nodes.push_back(read_orders( - ctx, order, 4, arguments.num_rows_per_chunk, arguments.input_directory + ctx, + order, + 4, + arguments.num_rows_per_chunk, + arguments.input_directory, + orders_use_date32 )); // Fanout filtered orders: one for bloom filter, one for join diff --git a/cpp/benchmarks/streaming/ndsh/utils.hpp b/cpp/benchmarks/streaming/ndsh/utils.hpp index a3d9b09d9..c125a1421 100644 --- a/cpp/benchmarks/streaming/ndsh/utils.hpp +++ b/cpp/benchmarks/streaming/ndsh/utils.hpp @@ -128,6 +128,101 @@ std::unique_ptr make_date_filter( ); } +/** + * @brief Create a date range filter expression. 
+ * + * Creates a filter that checks if a date column falls within a half-open range. + * The operation will be equivalent to + * " >= DATE '' AND < DATE ''". + * + * @tparam timestamp_type The timestamp type to use for the filter scalars + * (e.g., cudf::timestamp_D or cudf::timestamp_ms) + * @param stream CUDA stream to use + * @param start_date The start date (inclusive) of the range + * @param end_date The end date (exclusive) of the range + * @param column_name The name of the column to compare + * @return Filter expression with proper lifetime management + */ +template +std::unique_ptr make_date_range_filter( + rmm::cuda_stream_view stream, + cuda::std::chrono::year_month_day start_date, + cuda::std::chrono::year_month_day end_date, + std::string const& column_name +) { + auto owner = new std::vector; + + // 0: column_reference + owner->push_back(std::make_shared(column_name)); + + // 1, 2: Scalars for start and end dates + owner->push_back( + std::make_shared>( + cuda::std::chrono::sys_days(start_date).time_since_epoch(), true, stream + ) + ); + owner->push_back( + std::make_shared>( + cuda::std::chrono::sys_days(end_date).time_since_epoch(), true, stream + ) + ); + + // 3, 4: Literals for start and end dates + owner->push_back( + std::make_shared( + *std::any_cast>>( + owner->at(1) + ) + ) + ); + owner->push_back( + std::make_shared( + *std::any_cast>>( + owner->at(2) + ) + ) + ); + + // 5: (GE, column, literal) + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::GREATER_EQUAL, + *std::any_cast>( + owner->at(0) + ), + *std::any_cast>(owner->at(3)) + ) + ); + + // 6: (LT, column, literal) + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::LESS, + *std::any_cast>( + owner->at(0) + ), + *std::any_cast>(owner->at(4)) + ) + ); + + // 7: (AND, GE, LT) + owner->push_back( + std::make_shared( + cudf::ast::ast_operator::LOGICAL_AND, + *std::any_cast>(owner->at(5)), + *std::any_cast>(owner->at(6)) + ) + ); + + return std::make_unique( + stream, + *std::any_cast>(owner->back()), + OwningWrapper(static_cast(owner), [](void* p) { + delete static_cast*>(p); + }) + ); +} + /** * @brief Sink messages into a channel and discard them. * From 026c4eca67441398150b5f95a054f0e9fc4fd09c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 07:52:43 -0800 Subject: [PATCH 70/75] reuse chunkwise_sort_by --- cpp/benchmarks/streaming/ndsh/q04.cpp | 61 ++++++--------------------- 1 file changed, 12 insertions(+), 49 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 4afc5d300..151fdbdb2 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -46,6 +45,7 @@ #include "groupby.hpp" #include "join.hpp" #include "parquet_writer.hpp" +#include "sort.hpp" #include "utils.hpp" namespace { @@ -74,53 +74,6 @@ std::vector final_groupby_requests() { return requests; } -/* Sort the grouped orders table by o_orderpriority. 
- -Input table: - - o_orderpriority - - order_count - -Output table: - - o_orderpriority (sorted ascending) - - order_count -*/ -rapidsmpf::streaming::Node sort_by( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - co_await ctx->executor()->schedule(); - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - co_return; - } - auto chunk = - co_await msg.release().make_available(ctx); - auto stream = chunk.stream(); - auto mr = ctx->br()->device_mr(); - auto table = chunk.table_view(); - - auto sorted_table = cudf::sort_by_key( - table, - table.select({0}), - {cudf::order::ASCENDING}, - {cudf::null_order::BEFORE}, - stream, - mr - ); - - co_await ch_out->send( - rapidsmpf::streaming::to_message( - msg.sequence_number(), - std::make_unique( - std::move(sorted_table), stream - ) - ) - ); - co_await ch_out->drain(ctx->executor()); -} - /* Select the columns after the join Input table: @@ -586,7 +539,17 @@ int main(int argc, char** argv) { ) ); auto sorted_output = ctx->create_channel(); - nodes.push_back(sort_by(ctx, final_groupby_output, sorted_output)); + nodes.push_back( + rapidsmpf::ndsh::chunkwise_sort_by( + ctx, + final_groupby_output, + sorted_output, + {0}, + {0, 1}, + {cudf::order::ASCENDING}, + {cudf::null_order::BEFORE} + ) + ); nodes.push_back( rapidsmpf::ndsh::write_parquet( ctx, From 52b3934f50996ccd0c33e12077eb4f6413d460ad Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 08:02:23 -0800 Subject: [PATCH 71/75] simplify --- cpp/benchmarks/streaming/ndsh/q04.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 151fdbdb2..1986047fb 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -103,17 +103,9 @@ rapidsmpf::streaming::Node select_columns( auto sequence_number = msg.sequence_number(); auto table = chunk.table_view(); - std::vector> result; - result.reserve(indices.size()); - for (auto idx : indices) { - result.push_back( - std::make_unique( - table.column(idx), chunk_stream, ctx->br()->device_mr() - ) - ); - } - - auto result_table = std::make_unique(std::move(result)); + auto result_table = std::make_unique( + chunk.table_view().select(indices), chunk_stream, ctx->br()->device_mr() + ); co_await ch_out->send( rapidsmpf::streaming::to_message( From c01a3776b7aeb28bf666ea7ecbbfa24c66fad0c9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 08:04:01 -0800 Subject: [PATCH 72/75] simplify --- cpp/benchmarks/streaming/ndsh/q04.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 1986047fb..7dd17a3ba 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -89,6 +89,7 @@ rapidsmpf::streaming::Node select_columns( std::vector indices ) { rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); while (!ch_out->is_shutdown()) { auto msg = co_await ch_in->receive(); @@ -96,7 +97,6 @@ rapidsmpf::streaming::Node select_columns( ctx->comm()->logger().debug("Select columns: no more input"); break; } - co_await ctx->executor()->schedule(); auto chunk = co_await msg.release().make_available(ctx); auto chunk_stream = chunk.stream(); @@ -191,13 +191,14 @@ rapidsmpf::streaming::Node filter_lineitem( std::shared_ptr ch_out ) { 
rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; + co_await ctx->executor()->schedule(); auto mr = ctx->br()->device_mr(); + while (!ch_out->is_shutdown()) { auto msg = co_await ch_in->receive(); if (msg.empty()) { break; } - co_await ctx->executor()->schedule(); auto chunk = co_await msg.release().make_available(ctx); auto chunk_stream = chunk.stream(); @@ -236,8 +237,8 @@ rapidsmpf::streaming::Node fanout_bounded( std::shared_ptr ch2_out ) { rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch1_out, ch2_out}; - co_await ctx->executor()->schedule(); + while (true) { auto msg = co_await ch_in->receive(); if (msg.empty()) { From a2d46be4360be778ed965cf6880814bd1589f858 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 08:10:04 -0800 Subject: [PATCH 73/75] static casts --- cpp/benchmarks/streaming/ndsh/q04.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 7dd17a3ba..7007e2617 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -445,7 +445,7 @@ int main(int argc, char** argv) { filtered_lineitem_shuffled, {0}, num_partitions, - rapidsmpf::OpID{static_cast(10 * i + op_id++)} + static_cast(10 * i + op_id++) ) ); @@ -458,7 +458,7 @@ int main(int argc, char** argv) { filtered_order_shuffled, {0}, num_partitions, - rapidsmpf::OpID{static_cast(10 * i + op_id++)} + static_cast(10 * i + op_id++) ) ); @@ -481,7 +481,7 @@ int main(int argc, char** argv) { orders_x_lineitem, {0}, {0}, - rapidsmpf::OpID{static_cast(10 * i + op_id++)}, + static_cast(10 * i + op_id++), rapidsmpf::ndsh::KeepKeys::YES ) ); From 4ddc8532d39f2c11c240527e8aa266052c80940c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 08:39:39 -0800 Subject: [PATCH 74/75] Use KeepKeys::NO --- cpp/benchmarks/streaming/ndsh/join.cpp | 44 ++++++++++++++++--- cpp/benchmarks/streaming/ndsh/join.hpp | 3 +- cpp/benchmarks/streaming/ndsh/q04.cpp | 60 ++------------------------ 3 files changed, 43 insertions(+), 64 deletions(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 19862755c..7502a2bdb 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -151,16 +151,19 @@ streaming::Node broadcast( * @param ctx Streaming context * @param left_chunk Chunk to join. Used as the probe table in a filtered join. * @param right_chunk Chunk to join. Used as the build table in a filtered join. - * @param right_on Key column indices in `left_chunk`. + * @param left_carrier Columns from `left_chunk` to include in the output. + * @param left_on Key column indices in `left_chunk`. * @param right_on Key column indices in `right_chunk`. * @param sequence Sequence number of the output + * @param left_event Event recording the availability of `left_chunk`. * - * @return Message of `TableChunk` containing the result of the inner join. + * @return Message of `TableChunk` containing the result of the semi join. 
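+ *
+ * @note `left_carrier` is assumed to be a view selecting (a subset of) the
+ * columns of `left_chunk`; the semi-join match indices are gathered from
+ * `left_carrier`, which is how callers implement `KeepKeys::NO` (see below).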
*/ streaming::Message semi_join_chunk( std::shared_ptr ctx, streaming::TableChunk const& left_chunk, streaming::TableChunk&& right_chunk, + cudf::table_view left_carrier, std::vector left_on, std::vector right_on, std::uint64_t sequence, @@ -193,7 +196,7 @@ streaming::Message semi_join_chunk( cudf::column_view indices = cudf::device_span(*match); auto result_columns = cudf::gather( - left_chunk.table_view(), + left_carrier, indices, cudf::out_of_bounds_policy::DONT_CHECK, chunk_stream, @@ -420,11 +423,11 @@ streaming::Node left_semi_join_broadcast_left( std::vector left_on, std::vector right_on, OpID tag, - [[maybe_unused]] KeepKeys keep_keys + KeepKeys keep_keys ) { streaming::ShutdownAtExit c{left, right, ch_out}; co_await ctx->executor()->schedule(); - ctx->comm()->logger().print("Inner broadcast join ", static_cast(tag)); + ctx->comm()->logger().print("Left semi broadcast join ", static_cast(tag)); auto left_table = co_await (co_await broadcast(ctx, left, tag)) .release() .make_available(ctx); @@ -434,6 +437,19 @@ streaming::Node left_semi_join_broadcast_left( CudaEvent left_event; left_event.record(left_table.stream()); + cudf::table_view left_carrier; + if (keep_keys == KeepKeys::YES) { + left_carrier = left_table.table_view(); + } else { + std::vector to_keep; + std::ranges::copy_if( + std::ranges::iota_view(0, left_table.table_view().num_columns()), + std::back_inserter(to_keep), + [&](auto i) { return std::ranges::find(left_on, i) == left_on.end(); } + ); + left_carrier = left_table.table_view().select(to_keep); + } + while (!ch_out->is_shutdown()) { auto right_msg = co_await right->receive(); if (right_msg.empty()) { @@ -449,6 +465,7 @@ streaming::Node left_semi_join_broadcast_left( ctx, left_table, std::move(right_chunk), + left_carrier, left_on, right_on, right_msg.sequence_number(), @@ -465,7 +482,8 @@ streaming::Node left_semi_join_shuffle( std::shared_ptr right, std::shared_ptr ch_out, std::vector left_on, - std::vector right_on + std::vector right_on, + KeepKeys keep_keys ) { streaming::ShutdownAtExit c{left, right, ch_out}; ctx->comm()->logger().print("Shuffle left semi join"); @@ -496,10 +514,24 @@ streaming::Node left_semi_join_shuffle( left_event.record(left_chunk.stream()); + cudf::table_view left_carrier; + if (keep_keys == KeepKeys::YES) { + left_carrier = left_chunk.table_view(); + } else { + std::vector to_keep; + std::ranges::copy_if( + std::ranges::iota_view(0, left_chunk.table_view().num_columns()), + std::back_inserter(to_keep), + [&](auto i) { return std::ranges::find(left_on, i) == left_on.end(); } + ); + left_carrier = left_chunk.table_view().select(to_keep); + } + co_await ch_out->send(semi_join_chunk( ctx, left_chunk, std::move(right_chunk), + left_carrier, left_on, right_on, left_msg.sequence_number(), diff --git a/cpp/benchmarks/streaming/ndsh/join.hpp b/cpp/benchmarks/streaming/ndsh/join.hpp index 07931926e..253ce0b3c 100644 --- a/cpp/benchmarks/streaming/ndsh/join.hpp +++ b/cpp/benchmarks/streaming/ndsh/join.hpp @@ -177,7 +177,8 @@ streaming::Node left_semi_join_shuffle( std::shared_ptr right, std::shared_ptr ch_out, std::vector left_on, - std::vector right_on + std::vector right_on, + KeepKeys keep_keys = KeepKeys::YES ); /** diff --git a/cpp/benchmarks/streaming/ndsh/q04.cpp b/cpp/benchmarks/streaming/ndsh/q04.cpp index 7007e2617..e5b6b63d1 100644 --- a/cpp/benchmarks/streaming/ndsh/q04.cpp +++ b/cpp/benchmarks/streaming/ndsh/q04.cpp @@ -74,51 +74,6 @@ std::vector final_groupby_requests() { return requests; } -/* Select the columns after the join - 
-Input table: - -- o_orderkey -- o_orderpriority - -*/ -rapidsmpf::streaming::Node select_columns( - std::shared_ptr ctx, - std::shared_ptr ch_in, - std::shared_ptr ch_out, - std::vector indices -) { - rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out}; - co_await ctx->executor()->schedule(); - - while (!ch_out->is_shutdown()) { - auto msg = co_await ch_in->receive(); - if (msg.empty()) { - ctx->comm()->logger().debug("Select columns: no more input"); - break; - } - auto chunk = - co_await msg.release().make_available(ctx); - auto chunk_stream = chunk.stream(); - auto sequence_number = msg.sequence_number(); - auto table = chunk.table_view(); - - auto result_table = std::make_unique( - chunk.table_view().select(indices), chunk_stream, ctx->br()->device_mr() - ); - - co_await ch_out->send( - rapidsmpf::streaming::to_message( - sequence_number, - std::make_unique( - std::move(result_table), chunk_stream - ) - ) - ); - } - co_await ch_out->drain(ctx->executor()); -} - rapidsmpf::streaming::Node read_lineitem( std::shared_ptr ctx, std::shared_ptr ch_out, @@ -373,13 +328,8 @@ int main(int argc, char** argv) { // [o_orderkey, o_orderpriority] auto order = ctx->create_channel(); - // [o_orderkey, o_orderpriority] - // Ideally this would *just* be o_orderpriority, pushing the projection - // into the join node / dropping the join key. - auto orders_x_lineitem = ctx->create_channel(); - // [o_orderpriority] - auto projected_columns = ctx->create_channel(); + auto orders_x_lineitem = ctx->create_channel(); // [o_orderpriority, order_count] auto grouped_chunkwise = ctx->create_channel(); @@ -482,19 +432,15 @@ int main(int argc, char** argv) { {0}, {0}, static_cast(10 * i + op_id++), - rapidsmpf::ndsh::KeepKeys::YES + rapidsmpf::ndsh::KeepKeys::NO ) ); } - nodes.push_back( - select_columns(ctx, orders_x_lineitem, projected_columns, {1}) - ); - nodes.push_back( rapidsmpf::ndsh::chunkwise_group_by( ctx, - projected_columns, + orders_x_lineitem, grouped_chunkwise, {0}, chunkwise_groupby_requests(), From ec8b45cc4c4adcf713b5bfc2659e1a465f561e5b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 30 Jan 2026 08:43:47 -0800 Subject: [PATCH 75/75] fix loop --- cpp/benchmarks/streaming/ndsh/join.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/streaming/ndsh/join.cpp b/cpp/benchmarks/streaming/ndsh/join.cpp index 7502a2bdb..f647f3065 100644 --- a/cpp/benchmarks/streaming/ndsh/join.cpp +++ b/cpp/benchmarks/streaming/ndsh/join.cpp @@ -491,7 +491,7 @@ streaming::Node left_semi_join_shuffle( co_await ctx->executor()->schedule(); CudaEvent left_event; - while (true) { + while (!ch_out->is_shutdown()) { // Requirement: two shuffles kick out partitions in the same order auto left_msg = co_await left->receive(); auto right_msg = co_await right->receive();
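
For reference, patches 72 and 75 converge the streaming nodes on a single
consumer-loop shape: hop onto the executor once up front, check
`is_shutdown()` on the output channel instead of looping forever, and drain
on exit. A minimal sketch of that shape, assuming the `Context`/`Channel` API
used throughout these patches (the `passthrough` name is illustrative, not
part of the codebase, and the usual rapidsmpf streaming headers are assumed
to be included):

rapidsmpf::streaming::Node passthrough(
    std::shared_ptr<rapidsmpf::streaming::Context> ctx,
    std::shared_ptr<rapidsmpf::streaming::Channel> ch_in,
    std::shared_ptr<rapidsmpf::streaming::Channel> ch_out
) {
    // Shut both channels down when this coroutine exits, normally or not.
    rapidsmpf::streaming::ShutdownAtExit c{ch_in, ch_out};
    // Schedule once, before the loop, rather than per message (patch 72).
    co_await ctx->executor()->schedule();
    // Stop early if a downstream consumer already shut the output down
    // (patch 75), instead of producing into a channel nobody reads.
    while (!ch_out->is_shutdown()) {
        auto msg = co_await ch_in->receive();
        if (msg.empty()) {
            break;  // upstream has drained: no more input
        }
        co_await ch_out->send(std::move(msg));
    }
    co_await ch_out->drain(ctx->executor());
}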