From 0f3dbc8b2ae09506f554e29ce8a27c27ffddf34c Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Thu, 6 Nov 2025 07:36:39 -0800
Subject: [PATCH 01/26] Add nvCOMP benchmark

---
 cpp/benchmarks/CMakeLists.txt      |  30 +
 cpp/benchmarks/bench_comp_comm.cpp | 916 +++++++++++++++++++++++++++++
 2 files changed, 946 insertions(+)
 create mode 100644 cpp/benchmarks/bench_comp_comm.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index da5bd7be3..01ec5528a 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -61,6 +61,36 @@ install(
   EXCLUDE_FROM_ALL
 )
 
+# ----------------------------------------------------------------------------
+# nvCOMP compression + Communicator benchmark
+find_package(nvcomp CONFIG REQUIRED)
+add_executable(bench_comp_comm "bench_comp_comm.cpp")
+set_target_properties(
+  bench_comp_comm
+  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$"
+             CXX_STANDARD 20
+             CXX_STANDARD_REQUIRED ON
+             # For std:: support of __int128_t. Can be removed once using cuda::std
+             CXX_EXTENSIONS ON
+             CUDA_STANDARD 20
+             CUDA_STANDARD_REQUIRED ON
+             LINK_FLAGS "-Wl,--allow-shlib-undefined"
+)
+target_compile_options(
+  bench_comp_comm PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAPIDSMPF_CXX_FLAGS}>"
+                          "$<$<COMPILE_LANGUAGE:CUDA>:${RAPIDSMPF_CUDA_FLAGS}>"
+)
+target_link_libraries(
+  bench_comp_comm PRIVATE rapidsmpf::rapidsmpf ucxx::ucxx $ nvcomp::nvcomp $
+                          maybe_asan bench_utils
+)
+install(
+  TARGETS bench_comp_comm
+  COMPONENT benchmarking
+  DESTINATION bin/benchmarks/librapidsmpf
+  EXCLUDE_FROM_ALL
+)
+
 # Find or install GoogleBench
 include(${rapids-cmake-dir}/cpm/gbench.cmake)
 rapids_cpm_gbench(BUILD_STATIC)

diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp
new file mode 100644
index 000000000..6cc557f32
--- /dev/null
+++ b/cpp/benchmarks/bench_comp_comm.cpp
@@ -0,0 +1,916 @@
/**
 * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RAPIDSMPF_HAVE_CUPTI +#include +#endif + +#include "utils/misc.hpp" +#include "utils/rmm_stack.hpp" + +// nvCOMP managers (v3.x API) +#include +#include + +using namespace rapidsmpf; + +namespace { + +enum class PackMode { + Table, + Columns +}; +enum class Algo { + Cascaded, + LZ4 +}; + +struct KvParams { + // Common + std::size_t chunk_size{1 << 20}; + // Cascaded + int cascaded_rle{1}; + int cascaded_delta{1}; + int cascaded_bitpack{1}; +}; + +struct Args { + std::string comm_type{"mpi"}; + std::uint64_t num_runs{1}; + std::uint64_t num_warmups{0}; + std::string rmm_mr{"pool"}; + std::string file_pattern; // required + PackMode pack_mode{PackMode::Table}; + Algo algo{Algo::Cascaded}; + KvParams params{}; + std::uint64_t num_ops{1}; + bool enable_cupti_monitoring{false}; + std::string cupti_csv_prefix; +}; + +std::vector expand_glob(std::string const& pattern) { + std::vector files; + glob_t glob_result{}; + int rc = glob(pattern.c_str(), GLOB_TILDE, nullptr, &glob_result); + if (rc == 0) { + for (std::size_t i = 0; i < glob_result.gl_pathc; ++i) { + files.emplace_back(glob_result.gl_pathv[i]); + } + } + globfree(&glob_result); + std::sort(files.begin(), files.end()); + return files; +} + +std::size_t parse_nbytes(std::string const& s) { + // Simple parser: supports suffixes KiB, MiB, GiB, KB, MB, GB, or no suffix. + auto to_lower = [](char c) { return static_cast(std::tolower(c)); }; + std::string v; + v.reserve(s.size()); + for (char c : s) + v.push_back(to_lower(c)); + + std::size_t mult = 1; + if (v.ends_with("kib")) { + mult = 1ull << 10; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("mib")) { + mult = 1ull << 20; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("gib")) { + mult = 1ull << 30; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("kb")) { + mult = 1000ull; + v = v.substr(0, v.size() - 2); + } else if (v.ends_with("mb")) { + mult = 1000ull * 1000ull; + v = v.substr(0, v.size() - 2); + } else if (v.ends_with("gb")) { + mult = 1000ull * 1000ull * 1000ull; + v = v.substr(0, v.size() - 2); + } + + return static_cast(std::stoll(v)) * mult; +} + +KvParams parse_kv_params(std::string const& kv) { + KvParams p{}; + if (kv.empty()) + return p; + std::size_t start = 0; + while (start < kv.size()) { + auto comma = kv.find(',', start); + auto part = kv.substr( + start, comma == std::string::npos ? 
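// Worked example for the two parsers here (illustrative values, not from the
// original patch): parse_nbytes("1MiB") == 1 << 20, parse_nbytes("2kb") == 2000,
// and a bare parse_nbytes("512") == 512. Consequently a flag such as
// -K "chunk_size=1MiB,rle=1,bitpack=0" produces a KvParams with
// chunk_size == 1 << 20, cascaded_rle == 1, cascaded_bitpack == 0, and
// cascaded_delta left at its default of 1.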
std::string::npos : comma - start + ); + auto eq = part.find('='); + if (eq != std::string::npos) { + std::string key = part.substr(0, eq); + std::string val = part.substr(eq + 1); + if (key == "chunk_size") + p.chunk_size = parse_nbytes(val); + else if (key == "delta") + p.cascaded_delta = std::stoi(val); + else if (key == "rle") + p.cascaded_rle = std::stoi(val); + else if (key == "bitpack") + p.cascaded_bitpack = std::stoi(val); + } + if (comma == std::string::npos) + break; + start = comma + 1; + } + return p; +} + +struct PhaseThroughputs { + double compress_Bps{0.0}; + double decompress_Bps{0.0}; + double comp_send_Bps{0.0}; + double recv_decomp_Bps{0.0}; + double send_only_Bps{0.0}; + double recv_only_Bps{0.0}; +}; + +class NvcompCodec { + public: + virtual ~NvcompCodec() = default; + virtual std::size_t get_max_compressed_bytes(std::size_t uncompressed_bytes) = 0; + virtual void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) = 0; + virtual void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) = 0; +}; + +class LZ4Codec final : public NvcompCodec { + public: + explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} + + std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { + nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + mgr.compress(d_out, d_in, cfg); + // Compressed size is stored at the beginning of output; ask manager + auto info = mgr.get_compress_result(d_out); + *out_bytes = info.compressed_bytes; + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0, stream.value()}; + auto cfg = mgr.configure_decompression(d_in, in_bytes); + (void)out_bytes; // decomp size implied by cfg + mgr.decompress(d_out, d_in, cfg); + } + + private: + std::size_t chunk_size_; +}; + +class CascadedCodec final : public NvcompCodec { + public: + CascadedCodec(std::size_t chunk_size, int rle, int delta, int bitpack) + : opts_{rle != 0, delta != 0, bitpack != 0, static_cast(chunk_size)} {} + + std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { + nvcomp::CascadedManager mgr{opts_}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{opts_, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + mgr.compress(d_out, d_in, cfg); + auto info = mgr.get_compress_result(d_out); + *out_bytes = info.compressed_bytes; + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{opts_, stream.value()}; + auto cfg = mgr.configure_decompression(d_in, in_bytes); + (void)out_bytes; + mgr.decompress(d_out, d_in, cfg); + } + + private: + 
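// Hedged usage sketch of the NvcompCodec interface defined above, mirroring the
// calls made later in run_once(); `br`, `stream`, `d_in` and `in_bytes` are
// assumed to be in scope:
//
//   auto codec = make_codec(Algo::LZ4, KvParams{});  // factory defined below
//   std::size_t max_out = codec->get_max_compressed_bytes(in_bytes);
//   auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE);
//   auto out = br->allocate(max_out, stream, reservation);
//   std::size_t out_bytes = 0;
//   codec->compress(d_in, in_bytes, /*d_out=*/..., &out_bytes, stream);
//   // On success out_bytes <= max_out; decompress() reverses the transform.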
nvcomp::CascadedOptions opts_{}; +}; + +std::unique_ptr make_codec(Algo algo, KvParams const& p) { + switch (algo) { + case Algo::LZ4: + return std::make_unique(p.chunk_size); + case Algo::Cascaded: + default: + return std::make_unique( + p.chunk_size, p.cascaded_rle, p.cascaded_delta, p.cascaded_bitpack + ); + } +} + +// Convenience: wrap metadata + gpu_data into rapidsmpf::PackedData +static std::unique_ptr pack_table_to_packed( + cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br +) { + auto packed = cudf::pack(tv, stream, br->device_mr()); + auto metadata = + std::make_unique>(std::move(packed.metadata)); + auto buf = br->move( + std::make_unique(std::move(packed.gpu_data)), stream + ); + return std::make_unique(std::move(metadata), std::move(buf)); +} + +struct ArgumentParser { + ArgumentParser(int argc, char* const* argv, bool use_mpi) { + int rank = 0; + if (use_mpi) { + RAPIDSMPF_EXPECTS(mpi::is_initialized() == true, "MPI is not initialized"); + RAPIDSMPF_MPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); + } + try { + int opt; + // C: comm, r: runs, w: warmups, m: rmm, F: files, P: pack mode, A: algo, K: + // kv, p: ops, M: cupti, h: help + while ((opt = getopt(argc, argv, "C:r:w:m:F:P:A:K:p:M:h")) != -1) { + switch (opt) { + case 'C': + args_.comm_type = std::string{optarg}; + break; + case 'r': + parse_integer(args_.num_runs, optarg); + break; + case 'w': + parse_integer(args_.num_warmups, optarg); + break; + case 'm': + args_.rmm_mr = std::string{optarg}; + break; + case 'F': + args_.file_pattern = std::string{optarg}; + break; + case 'P': + { + std::string v{optarg}; + if (v == "table") + args_.pack_mode = PackMode::Table; + else if (v == "columns") + args_.pack_mode = PackMode::Columns; + else + RAPIDSMPF_FAIL( + "-P must be one of {table, columns}", + std::invalid_argument + ); + break; + } + case 'A': + { + std::string v{optarg}; + if (v == "cascaded") + args_.algo = Algo::Cascaded; + else if (v == "lz4") + args_.algo = Algo::LZ4; + else + RAPIDSMPF_FAIL( + "-A must be one of {cascaded, lz4}", std::invalid_argument + ); + break; + } + case 'K': + args_.params = parse_kv_params(std::string{optarg}); + break; + case 'p': + parse_integer(args_.num_ops, optarg); + break; + case 'M': + args_.enable_cupti_monitoring = true; + args_.cupti_csv_prefix = std::string{optarg}; + break; + case 'h': + default: + { + std::stringstream ss; + ss << "Usage: " << argv[0] << " [options]\n" + << "Options:\n" + << " -C {mpi, ucxx} (default: mpi)\n" + << " -r Number of runs (default: 1)\n" + << " -w Number of warmup runs (default: 0)\n" + << " -m RMM MR {cuda, pool, async, managed} " + "(default: pool)\n" + << " -F Parquet file glob/pattern (required)\n" + << " -P Packing mode {table, columns} (default: " + "table)\n" + << " -A {cascaded, lz4} (default: cascaded)\n" + << " -K Algo params, e.g. 
" + "chunk_size=1MiB,delta=1,rle=1,bitpack=1\n" + << " -p Number of concurrent ops (default: 1)\n" + << " -M CUPTI CSV path prefix (enable CUPTI)\n" + << " -h Show this help\n"; + if (rank == 0) + std::cerr << ss.str(); + if (use_mpi) + RAPIDSMPF_MPI(MPI_Abort(MPI_COMM_WORLD, 0)); + else + std::exit(0); + } + } + } + } catch (std::exception const& e) { + std::cerr << "Error parsing arguments: " << e.what() << std::endl; + if (use_mpi) + RAPIDSMPF_MPI(MPI_Abort(MPI_COMM_WORLD, -1)); + else + std::exit(-1); + } + if (args_.file_pattern.empty()) { + std::cerr << "-F is required" << std::endl; + if (use_mpi) + RAPIDSMPF_MPI(MPI_Abort(MPI_COMM_WORLD, -1)); + else + std::exit(-1); + } + if (args_.rmm_mr == "cuda") { + if (rank == 0) { + std::cout << "WARNING: using the default cuda memory resource (-m cuda) " + "might leak memory!" + << std::endl; + } + } + } + + Args const& get() const { + return args_; + } + + private: + Args args_{}; +}; + +struct PackedItem { + // Ownership: we store size and buffer pointer for the packed payload + std::unique_ptr packed; // original packed cudf table/column +}; + +struct BuffersToSend { + // For each op, we will send these items + std::vector items; + std::size_t total_uncompressed_bytes{0}; +}; + +BuffersToSend make_packed_items( + cudf::table const& table, + PackMode mode, + rmm::cuda_stream_view stream, + BufferResource* br +) { + BuffersToSend ret{}; + if (mode == PackMode::Table) { + auto item = PackedItem{}; + item.packed = pack_table_to_packed(table.view(), stream, br); + ret.total_uncompressed_bytes += + item.packed->data->size + item.packed->metadata->size(); + ret.items.emplace_back(std::move(item)); + } else { + auto tv = table.view(); + for (cudf::size_type i = 0; i < tv.num_columns(); ++i) { + cudf::table_view col_tv{std::vector{tv.column(i)}}; + auto item = PackedItem{}; + item.packed = pack_table_to_packed(col_tv, stream, br); + ret.total_uncompressed_bytes += + item.packed->data->size + item.packed->metadata->size(); + ret.items.emplace_back(std::move(item)); + } + } + return ret; +} + +// Send/recv helpers: send a header (compressed size) as host buffer. +struct SizeHeader { + std::uint64_t bytes; +}; + +struct Timings { + double compress_s{0.0}; + double decompress_s{0.0}; + double comp_send_s{0.0}; + double recv_decomp_s{0.0}; + double send_only_s{0.0}; + double recv_only_s{0.0}; +}; + +// Returns timings and bytes counters +struct Counters { + std::size_t logical_uncompressed_bytes{0}; + std::size_t logical_compressed_bytes{0}; +}; + +struct RunResult { + Timings times; + Counters counts; +}; + +RunResult run_once( + std::shared_ptr const& comm, + Args const& args, + rmm::cuda_stream_view stream, + BufferResource* br, + std::shared_ptr const& statistics, + BuffersToSend const& data, + NvcompCodec& codec +) { + auto const nranks = comm->nranks(); + auto const rank = comm->rank(); + auto const dst = static_cast((rank + 1) % nranks); + auto const src = static_cast((rank - 1 + nranks) % nranks); + + Tag tag_size{1, 0}; + Tag tag_payload{1, 1}; + Tag tag_nocomp{2, 0}; + + // Clone packed items into raw device buffers for repeated ops + std::vector> nocomp_payloads; + nocomp_payloads.reserve(data.items.size()); + for (auto const& it : data.items) { + // Copy metadata + data into a contiguous device buffer for pure send path? + // For pure send/recv, we only send the device payload; metadata isn't needed for + // metrics. We'll send the packed->data buffer. 
+ auto reservation = br->reserve_or_fail(it.packed->data->size, MemoryType::DEVICE); + auto buf = br->allocate(it.packed->data->size, stream, reservation); + buffer_copy(*buf, *it.packed->data, it.packed->data->size); + nocomp_payloads.emplace_back(std::move(buf)); + } + + // Pre-allocate compression outputs for each item + std::vector> comp_outputs; + std::vector comp_output_sizes(data.items.size()); + comp_outputs.reserve(data.items.size()); + for (std::size_t i = 0; i < data.items.size(); ++i) { + auto const in_bytes = data.items[i].packed->data->size; + auto const max_out = codec.get_max_compressed_bytes(in_bytes); + auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE); + comp_outputs.emplace_back(br->allocate(max_out, stream, reservation)); + } + + RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); + auto t0 = Clock::now(); + // Compress all items (single batch) on stream + for (std::size_t i = 0; i < data.items.size(); ++i) { + void const* d_in = data.items[i].packed->data->device()->get()->data(); + void* d_out = comp_outputs[i]->device()->get()->data(); + std::size_t out_bytes = 0; + codec.compress(d_in, data.items[i].packed->data->size, d_out, &out_bytes, stream); + comp_output_sizes[i] = out_bytes; + } + RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); + auto t1 = Clock::now(); + + // Phase A: pure send/recv (no compression) + auto a0 = Clock::now(); + std::vector> send_futs; + std::vector> recv_futs; + send_futs.reserve(args.num_ops * nocomp_payloads.size()); + recv_futs.reserve(args.num_ops * nocomp_payloads.size()); + + for (std::uint64_t op = 0; op < args.num_ops; ++op) { + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + // post recv first + if (src != rank) { + auto res = + br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); + auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + recv_futs.push_back(comm->recv(src, tag_nocomp, std::move(recv_buf))); + } + } + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + if (dst != rank) { + auto res = + br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); + auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); + send_futs.push_back(comm->send(std::move(send_buf), dst, tag_nocomp)); + } + } + } + while (!send_futs.empty()) { + std::ignore = comm->test_some(send_futs); + } + auto a1 = Clock::now(); + while (!recv_futs.empty()) { + std::ignore = comm->test_some(recv_futs); + } + auto a2 = Clock::now(); + + // Phase B: compressed path (send size header, then compressed payload) + auto b0 = Clock::now(); + std::vector> send_hdr_futs; + std::vector> send_cmp_futs; + std::vector> recv_hdr_futs; + std::vector> recv_cmp_futs; + send_hdr_futs.reserve(args.num_ops * data.items.size()); + send_cmp_futs.reserve(args.num_ops * data.items.size()); + recv_hdr_futs.reserve(args.num_ops * data.items.size()); + recv_cmp_futs.reserve(args.num_ops * data.items.size()); + + for (std::uint64_t op = 0; op < args.num_ops; ++op) { + for (std::size_t i = 0; i < data.items.size(); ++i) { + // post recv header + if (src != rank) { + auto res_h = br->reserve_or_fail(sizeof(SizeHeader), MemoryType::HOST); + auto hdr = br->allocate(sizeof(SizeHeader), stream, res_h); + recv_hdr_futs.push_back(comm->recv(src, tag_size, std::move(hdr))); + } + } + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (dst != rank) { + auto res_h = br->reserve_or_fail(sizeof(SizeHeader), MemoryType::HOST); + auto hdr = 
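// Two idioms meet at this point (hedged summary of the surrounding code): every
// buffer follows the reserve_or_fail() -> allocate() -> fill pattern of
// BufferResource, and the compressed path uses a header-then-payload protocol:
// a fixed-size SizeHeader carrying the compressed byte count travels on
// tag_size first, so the peer can post an exactly-sized device recv on
// tag_payload. The header write just below has this shape:
//
//   hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) {
//       SizeHeader h{static_cast<std::uint64_t>(comp_output_sizes[i])};
//       std::memcpy(p, &h, sizeof(SizeHeader));
//   });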
br->allocate(sizeof(SizeHeader), stream, res_h); + // write header + hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { + SizeHeader h{static_cast(comp_output_sizes[i])}; + std::memcpy(p, &h, sizeof(SizeHeader)); + }); + send_hdr_futs.push_back(comm->send(std::move(hdr), dst, tag_size)); + } + } + } + while (!send_hdr_futs.empty()) { + std::ignore = comm->test_some(send_hdr_futs); + } + while (!recv_hdr_futs.empty()) { + std::ignore = comm->test_some(recv_hdr_futs); + } + + // Post payload recvs now that we know sizes + for (std::uint64_t op = 0; op < args.num_ops; ++op) { + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (src != rank) { + // reuse comp_output_sizes[i] as expected size since peers are symmetric + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto buf = br->allocate(comp_output_sizes[i], stream, res); + recv_cmp_futs.push_back(comm->recv(src, tag_payload, std::move(buf))); + } + } + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (dst != rank) { + // send compressed + auto tmp_buf = + br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto send_buf = br->allocate(comp_output_sizes[i], stream, tmp_buf); + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + send_cmp_futs.push_back( + comm->send(std::move(send_buf), dst, tag_payload) + ); + } + } + } + while (!send_cmp_futs.empty()) { + std::ignore = comm->test_some(send_cmp_futs); + } + auto b1 = Clock::now(); + while (!recv_cmp_futs.empty()) { + std::ignore = comm->test_some(recv_cmp_futs); + } + auto b2 = Clock::now(); + + // Decompress received buffers (simulate by decompressing our own produced outputs in + // symmetric setup) + auto c0 = Clock::now(); + for (std::size_t i = 0; i < data.items.size(); ++i) { + auto const out_bytes = data.items[i].packed->data->size; + auto res = br->reserve_or_fail(out_bytes, MemoryType::DEVICE); + auto out = br->allocate(out_bytes, stream, res); + void const* d_in = comp_outputs[i]->device()->get()->data(); + void* d_out = out->device()->get()->data(); + codec.decompress(d_in, comp_output_sizes[i], d_out, out_bytes, stream); + } + RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); + auto c1 = Clock::now(); + + RunResult result{}; + result.times.compress_s = std::chrono::duration(t1 - t0).count(); + result.times.send_only_s = std::chrono::duration(a1 - a0).count(); + result.times.recv_only_s = std::chrono::duration(a2 - a1).count(); + result.times.comp_send_s = std::chrono::duration(b1 - b0).count(); + result.times.recv_decomp_s = std::chrono::duration(b2 - b1).count() + + std::chrono::duration(c1 - c0).count(); + result.times.decompress_s = std::chrono::duration(c1 - c0).count(); + + result.counts.logical_uncompressed_bytes = + data.total_uncompressed_bytes * args.num_ops; + result.counts.logical_compressed_bytes = + std::accumulate( + comp_output_sizes.begin(), comp_output_sizes.end(), std::size_t{0} + ) + * args.num_ops; + return result; +} + +} // namespace + +int main(int argc, char** argv) { + // Check if we should use bootstrap mode with rrun + bool use_bootstrap = std::getenv("RAPIDSMPF_RANK") != nullptr; + + int provided = 0; + if (!use_bootstrap) { + RAPIDSMPF_MPI(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided)); + RAPIDSMPF_EXPECTS( + provided == MPI_THREAD_MULTIPLE, + "didn't get the requested thread level support: MPI_THREAD_MULTIPLE" + ); + } + + ArgumentParser parser{argc, argv, !use_bootstrap}; + Args const& args = parser.get(); + + // Initialize configuration options from 
environment variables. + rapidsmpf::config::Options options{rapidsmpf::config::get_environment_variables()}; + + std::shared_ptr comm; + if (args.comm_type == "mpi") { + if (use_bootstrap) { + std::cerr << "Error: MPI communicator requires MPI initialization. Don't use " + "with rrun or unset RAPIDSMPF_RANK." + << std::endl; + return 1; + } + mpi::init(&argc, &argv); + comm = std::make_shared(MPI_COMM_WORLD, options); + } else if (args.comm_type == "ucxx") { + if (use_bootstrap) { + comm = rapidsmpf::bootstrap::create_ucxx_comm( + rapidsmpf::bootstrap::Backend::AUTO, options + ); + } else { + comm = rapidsmpf::ucxx::init_using_mpi(MPI_COMM_WORLD, options); + } + } else { + std::cerr << "Unknown communicator: " << args.comm_type << std::endl; + return 1; + } + + auto& log = comm->logger(); + rmm::cuda_stream_view stream = cudf::get_default_stream(); + + // RMM setup + auto const mr_stack = set_current_rmm_stack(args.rmm_mr); + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); + BufferResource br{mr}; + + // Hardware info + { + std::stringstream ss; + auto const cur_dev = rmm::get_current_cuda_device().value(); + std::string pci_bus_id(16, '\0'); + RAPIDSMPF_CUDA_TRY( + cudaDeviceGetPCIBusId(pci_bus_id.data(), pci_bus_id.size(), cur_dev) + ); + cudaDeviceProp properties; + RAPIDSMPF_CUDA_TRY(cudaGetDeviceProperties(&properties, 0)); + ss << "Hardware setup: \n"; + ss << " GPU (" << properties.name << "): \n"; + ss << " Device number: " << cur_dev << "\n"; + ss << " PCI Bus ID: " << pci_bus_id.substr(0, pci_bus_id.find('\0')) << "\n"; + ss << " Total Memory: " << format_nbytes(properties.totalGlobalMem, 0) << "\n"; + ss << " Comm: " << *comm << "\n"; + log.print(ss.str()); + } + + // Stats and CUPTI + auto stats = std::make_shared(/* enable = */ false); +#ifdef RAPIDSMPF_HAVE_CUPTI + std::unique_ptr cupti_monitor; + if (args.enable_cupti_monitoring) { + cupti_monitor = std::make_unique(); + cupti_monitor->start_monitoring(); + log.print("CUPTI memory monitoring enabled"); + } +#endif + + // File selection per rank + auto files = expand_glob(args.file_pattern); + if (files.empty()) { + if (comm->rank() == 0) + log.print("No files matched pattern: " + args.file_pattern); + if (!use_bootstrap) + RAPIDSMPF_MPI(MPI_Finalize()); + return 1; + } + auto my_file = files[static_cast(comm->rank()) % files.size()]; + if (comm->rank() == 0) + log.print( + "Using file pattern: " + args.file_pattern + ", first file: " + files.front() + ); + log.print("Rank " + std::to_string(comm->rank()) + " reading: " + my_file); + + // Read Parquet into cudf::table + cudf::io::parquet_reader_options reader_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{my_file}); + auto table_with_md = cudf::io::read_parquet(reader_opts); + auto& table = table_with_md.tbl; + + // Pack per mode + auto packed = make_packed_items(*table, args.pack_mode, stream, &br); + + // Prepare codec + auto codec = make_codec(args.algo, args.params); + + // Runs + std::vector compress_t, decompress_t, comp_send_t, recv_decomp_t, send_t, + recv_t; + compress_t.reserve(args.num_runs); + decompress_t.reserve(args.num_runs); + comp_send_t.reserve(args.num_runs); + recv_decomp_t.reserve(args.num_runs); + send_t.reserve(args.num_runs); + recv_t.reserve(args.num_runs); + + std::size_t logical_bytes = packed.total_uncompressed_bytes * args.num_ops; + + for (std::uint64_t i = 0; i < args.num_warmups + args.num_runs; ++i) { + if (i == args.num_warmups + args.num_runs - 1) { + stats = std::make_shared(/* enable 
= */ true); + } + auto rr = run_once(comm, args, stream, &br, stats, packed, *codec); + + double cBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.compress_s; + double dBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.decompress_s; + double csBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.comp_send_s; + double rdBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.recv_decomp_s; + double sBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.send_only_s; + double rBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.recv_only_s; + + std::stringstream ss; + ss << "compress: " << format_nbytes(cBps) + << "/s | decompress: " << format_nbytes(dBps) + << "/s | comp+send: " << format_nbytes(csBps) + << "/s | recv+decomp: " << format_nbytes(rdBps) + << "/s | send-only: " << format_nbytes(sBps) + << "/s | recv-only: " << format_nbytes(rBps) << "/s"; + if (i < args.num_warmups) + ss << " (warmup run)"; + log.print(ss.str()); + + if (i >= args.num_warmups) { + compress_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / cBps + ); + decompress_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / dBps + ); + comp_send_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / csBps + ); + recv_decomp_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / rdBps + ); + send_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / sBps + ); + recv_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / rBps + ); + } + } + + // Means + auto harmonic_mean = [](std::vector const& v) { + double denom_sum = 0.0; + for (auto x : v) + denom_sum += 1.0 / x; + return static_cast(v.size()) / denom_sum; + }; + + if (!compress_t.empty()) { + double mean_elapsed_c = harmonic_mean(compress_t); + double mean_elapsed_d = harmonic_mean(decompress_t); + double mean_elapsed_cs = harmonic_mean(comp_send_t); + double mean_elapsed_rd = harmonic_mean(recv_decomp_t); + double mean_elapsed_s = harmonic_mean(send_t); + double mean_elapsed_r = harmonic_mean(recv_t); + + std::stringstream ss; + ss << "means: compress: " << format_nbytes(logical_bytes / mean_elapsed_c) << "/s" + << " | decompress: " << format_nbytes(logical_bytes / mean_elapsed_d) << "/s" + << " | comp+send: " << format_nbytes(logical_bytes / mean_elapsed_cs) << "/s" + << " | recv+decomp: " << format_nbytes(logical_bytes / mean_elapsed_rd) << "/s" + << " | send-only: " << format_nbytes(logical_bytes / mean_elapsed_s) << "/s" + << " | recv-only: " << format_nbytes(logical_bytes / mean_elapsed_r) << "/s"; + log.print(ss.str()); + } + +#ifdef RAPIDSMPF_HAVE_CUPTI + if (args.enable_cupti_monitoring && cupti_monitor) { + cupti_monitor->stop_monitoring(); + std::string csv_filename = + args.cupti_csv_prefix + std::to_string(comm->rank()) + ".csv"; + try { + cupti_monitor->write_csv(csv_filename); + log.print( + "CUPTI memory data written to " + csv_filename + " (" + + std::to_string(cupti_monitor->get_sample_count()) + " samples, " + + std::to_string(cupti_monitor->get_total_callback_count()) + + " callbacks)" + ); + if (comm->rank() == 0) { + log.print( + "CUPTI Callback Summary:\n" + cupti_monitor->get_callback_summary() + ); + } + } catch (std::exception const& e) { + log.print("Failed to write CUPTI CSV file: " + std::string(e.what())); + } + } +#endif + + if (!use_bootstrap) { + RAPIDSMPF_MPI(MPI_Finalize()); + } + return 0; +} From d0575cbd72ae20ec0cbb2bb2e422c9ef9dac8b7a Mon Sep 17 
00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 6 Nov 2025 07:49:13 -0800 Subject: [PATCH 02/26] Fixes --- cpp/benchmarks/bench_comp_comm.cpp | 109 +++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 6cc557f32..e2e0769ca 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -197,7 +197,9 @@ class LZ4Codec final : public NvcompCodec { explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { - nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0}; + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, 0}; auto cfg = mgr.configure_compression(in_bytes); return cfg.max_compressed_buffer_size; } @@ -209,12 +211,16 @@ class LZ4Codec final : public NvcompCodec { std::size_t* out_bytes, rmm::cuda_stream_view stream ) override { - nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0, stream.value()}; + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - mgr.compress(d_out, d_in, cfg); - // Compressed size is stored at the beginning of output; ask manager - auto info = mgr.get_compress_result(d_out); - *out_bytes = info.compressed_bytes; + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + out_bytes + ); } void decompress( @@ -224,10 +230,15 @@ class LZ4Codec final : public NvcompCodec { std::size_t out_bytes, rmm::cuda_stream_view stream ) override { - nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0, stream.value()}; - auto cfg = mgr.configure_decompression(d_in, in_bytes); - (void)out_bytes; // decomp size implied by cfg - mgr.decompress(d_out, d_in, cfg); + (void)out_bytes; + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); } private: @@ -237,10 +248,16 @@ class LZ4Codec final : public NvcompCodec { class CascadedCodec final : public NvcompCodec { public: CascadedCodec(std::size_t chunk_size, int rle, int delta, int bitpack) - : opts_{rle != 0, delta != 0, bitpack != 0, static_cast(chunk_size)} {} + : chunk_size_{chunk_size} { + copts_ = nvcompBatchedCascadedCompressDefaultOpts; + copts_.num_RLEs = rle ? 1 : 0; + copts_.num_deltas = delta ? 1 : 0; + copts_.use_bp = bitpack ? 
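// Mapping from -K flags to the batched cascaded options used here (worked
// example, values illustrative): rle=1,delta=0,bitpack=1 yields num_RLEs == 1,
// num_deltas == 0, and use_bp == 1 on nvcompBatchedCascadedCompressOpts_t.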
1 : 0; + dopts_ = nvcompBatchedCascadedDecompressDefaultOpts; + } std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { - nvcomp::CascadedManager mgr{opts_}; + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, 0}; auto cfg = mgr.configure_compression(in_bytes); return cfg.max_compressed_buffer_size; } @@ -252,11 +269,14 @@ class CascadedCodec final : public NvcompCodec { std::size_t* out_bytes, rmm::cuda_stream_view stream ) override { - nvcomp::CascadedManager mgr{opts_, stream.value()}; + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - mgr.compress(d_out, d_in, cfg); - auto info = mgr.get_compress_result(d_out); - *out_bytes = info.compressed_bytes; + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + out_bytes + ); } void decompress( @@ -266,14 +286,19 @@ class CascadedCodec final : public NvcompCodec { std::size_t out_bytes, rmm::cuda_stream_view stream ) override { - nvcomp::CascadedManager mgr{opts_, stream.value()}; - auto cfg = mgr.configure_decompression(d_in, in_bytes); (void)out_bytes; - mgr.decompress(d_out, d_in, cfg); + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); } private: - nvcomp::CascadedOptions opts_{}; + std::size_t chunk_size_{}; + nvcompBatchedCascadedCompressOpts_t copts_{}; + nvcompBatchedCascadedDecompressOpts_t dopts_{}; }; std::unique_ptr make_codec(Algo algo, KvParams const& p) { @@ -293,12 +318,9 @@ static std::unique_ptr pack_table_to_packed( cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br ) { auto packed = cudf::pack(tv, stream, br->device_mr()); - auto metadata = - std::make_unique>(std::move(packed.metadata)); - auto buf = br->move( - std::make_unique(std::move(packed.gpu_data)), stream + return std::make_unique( + std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) ); - return std::make_unique(std::move(metadata), std::move(buf)); } struct ArgumentParser { @@ -498,6 +520,7 @@ RunResult run_once( BuffersToSend const& data, NvcompCodec& codec ) { + (void)statistics; auto const nranks = comm->nranks(); auto const rank = comm->rank(); auto const dst = static_cast((rank + 1) % nranks); @@ -535,10 +558,23 @@ RunResult run_once( auto t0 = Clock::now(); // Compress all items (single batch) on stream for (std::size_t i = 0; i < data.items.size(); ++i) { - void const* d_in = data.items[i].packed->data->device()->get()->data(); - void* d_out = comp_outputs[i]->device()->get()->data(); std::size_t out_bytes = 0; - codec.compress(d_in, data.items[i].packed->data->size, d_out, &out_bytes, stream); + // Use exclusive access to fetch pointers and call codec on the provided stream + auto* in_buf = data.items[i].packed->data.get(); + auto* out_buf = comp_outputs[i].get(); + in_buf->stream().synchronize(); + out_buf->stream().synchronize(); + auto* in_raw = data.items[i].packed->data->exclusive_data_access(); + auto* out_raw = comp_outputs[i]->exclusive_data_access(); + codec.compress( + static_cast(in_raw), + data.items[i].packed->data->size, + static_cast(out_raw), + &out_bytes, + stream + ); + comp_outputs[i]->unlock(); + data.items[i].packed->data->unlock(); comp_output_sizes[i] = out_bytes; } RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); @@ 
-659,9 +695,20 @@ RunResult run_once( auto const out_bytes = data.items[i].packed->data->size; auto res = br->reserve_or_fail(out_bytes, MemoryType::DEVICE); auto out = br->allocate(out_bytes, stream, res); - void const* d_in = comp_outputs[i]->device()->get()->data(); - void* d_out = out->device()->get()->data(); - codec.decompress(d_in, comp_output_sizes[i], d_out, out_bytes, stream); + // Synchronize prior to exclusive access + comp_outputs[i]->stream().synchronize(); + out->stream().synchronize(); + auto* in_raw = comp_outputs[i]->exclusive_data_access(); + auto* out_raw = out->exclusive_data_access(); + codec.decompress( + static_cast(in_raw), + comp_output_sizes[i], + static_cast(out_raw), + out_bytes, + stream + ); + out->unlock(); + comp_outputs[i]->unlock(); } RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto c1 = Clock::now(); From 5a677f9cc8d10d9015b30d9726806b5e9a957030 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 6 Nov 2025 08:06:54 -0800 Subject: [PATCH 03/26] Fix illegal memory access --- cpp/benchmarks/bench_comp_comm.cpp | 112 +++++++++++++++++++---------- 1 file changed, 74 insertions(+), 38 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index e2e0769ca..5f88bc5eb 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -175,7 +175,9 @@ struct PhaseThroughputs { class NvcompCodec { public: virtual ~NvcompCodec() = default; - virtual std::size_t get_max_compressed_bytes(std::size_t uncompressed_bytes) = 0; + virtual std::size_t get_max_compressed_bytes( + std::size_t uncompressed_bytes, rmm::cuda_stream_view stream + ) = 0; virtual void compress( void const* d_in, std::size_t in_bytes, @@ -196,10 +198,12 @@ class LZ4Codec final : public NvcompCodec { public: explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} - std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; - nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, 0}; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); return cfg.max_compressed_buffer_size; } @@ -215,12 +219,19 @@ class LZ4Codec final : public NvcompCodec { nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - out_bytes + pinned_bytes ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); } void decompress( @@ -256,8 +267,10 @@ class CascadedCodec final : public NvcompCodec { dopts_ = nvcompBatchedCascadedDecompressDefaultOpts; } - std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { - nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, 0}; + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = 
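// Readback pattern introduced by this patch in both compress() overrides (LZ4
// above, Cascaded below): nvcomp publishes the compressed size asynchronously,
// so the result must land in pinned host memory and be read only after the
// stream is synchronized. Hedged restatement of that idiom in isolation:
//
//   size_t* pinned_bytes = nullptr;
//   RAPIDSMPF_CUDA_TRY(cudaHostAlloc(
//       reinterpret_cast<void**>(&pinned_bytes), sizeof(size_t),
//       cudaHostAllocDefault
//   ));
//   mgr.compress(in_ptr, out_ptr, cfg, pinned_bytes);  // async on the stream
//   RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
//   *out_bytes = *pinned_bytes;  // only now safe to read on the host
//   RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes));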
mgr.configure_compression(in_bytes); return cfg.max_compressed_buffer_size; } @@ -271,12 +284,19 @@ class CascadedCodec final : public NvcompCodec { ) override { nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - out_bytes + pinned_bytes ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); } void decompress( @@ -549,7 +569,8 @@ RunResult run_once( comp_outputs.reserve(data.items.size()); for (std::size_t i = 0; i < data.items.size(); ++i) { auto const in_bytes = data.items[i].packed->data->size; - auto const max_out = codec.get_max_compressed_bytes(in_bytes); + std::size_t const max_out = + (in_bytes == 0) ? 1 : codec.get_max_compressed_bytes(in_bytes, stream); auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE); comp_outputs.emplace_back(br->allocate(max_out, stream, reservation)); } @@ -558,24 +579,35 @@ RunResult run_once( auto t0 = Clock::now(); // Compress all items (single batch) on stream for (std::size_t i = 0; i < data.items.size(); ++i) { - std::size_t out_bytes = 0; - // Use exclusive access to fetch pointers and call codec on the provided stream - auto* in_buf = data.items[i].packed->data.get(); - auto* out_buf = comp_outputs[i].get(); - in_buf->stream().synchronize(); - out_buf->stream().synchronize(); - auto* in_raw = data.items[i].packed->data->exclusive_data_access(); - auto* out_raw = comp_outputs[i]->exclusive_data_access(); - codec.compress( - static_cast(in_raw), - data.items[i].packed->data->size, - static_cast(out_raw), - &out_bytes, - stream + auto const in_bytes = data.items[i].packed->data->size; + if (in_bytes == 0) { + comp_output_sizes[i] = 0; + continue; + } + // Ensure any prior writes to input are completed + data.items[i].packed->data->stream().synchronize(); + // Launch compression on the output buffer's stream and record an event after + comp_outputs[i]->write_access( + [&codec, &data, i, in_bytes, &comp_output_sizes, stream]( + std::byte* out_ptr, rmm::cuda_stream_view out_stream + ) { + (void)out_ptr; // pointer used below + // Lock input for raw pointer access + auto* in_raw = data.items[i].packed->data->exclusive_data_access(); + std::size_t out_bytes = 0; + codec.compress( + static_cast(in_raw), + in_bytes, + static_cast(out_ptr), + &out_bytes, + out_stream + ); + // Ensure comp_bytes is populated before returning + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(out_stream.value())); + data.items[i].packed->data->unlock(); + comp_output_sizes[i] = out_bytes; + } ); - comp_outputs[i]->unlock(); - data.items[i].packed->data->unlock(); - comp_output_sizes[i] = out_bytes; } RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = Clock::now(); @@ -693,22 +725,26 @@ RunResult run_once( auto c0 = Clock::now(); for (std::size_t i = 0; i < data.items.size(); ++i) { auto const out_bytes = data.items[i].packed->data->size; + if (out_bytes == 0) { + continue; + } auto res = br->reserve_or_fail(out_bytes, MemoryType::DEVICE); auto out = br->allocate(out_bytes, stream, res); - // Synchronize prior to exclusive access + // Ensure compressed outputs are ready before using as input comp_outputs[i]->stream().synchronize(); - out->stream().synchronize(); - auto* in_raw = 
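// Buffer-locking idiom used by the rewritten code in this hunk (hedged sketch;
// `src` and `dst` stand for the compressed input and decompressed output
// Buffers): write_access() hands the callback the destination pointer plus the
// buffer's own stream, while exclusive_data_access()/unlock() bracket raw
// access to the source:
//
//   dst->write_access([&](std::byte* out_ptr, rmm::cuda_stream_view s) {
//       auto* in_raw = src->exclusive_data_access();
//       codec.decompress(in_raw, in_bytes, out_ptr, out_bytes, s);  // casts elided
//       src->unlock();
//   });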
comp_outputs[i]->exclusive_data_access(); - auto* out_raw = out->exclusive_data_access(); - codec.decompress( - static_cast(in_raw), - comp_output_sizes[i], - static_cast(out_raw), - out_bytes, - stream - ); - out->unlock(); - comp_outputs[i]->unlock(); + out->write_access([&codec, &comp_outputs, &comp_output_sizes, i, out_bytes]( + std::byte* out_ptr, rmm::cuda_stream_view out_stream + ) { + auto* in_raw = comp_outputs[i]->exclusive_data_access(); + codec.decompress( + static_cast(in_raw), + comp_output_sizes[i], + static_cast(out_ptr), + out_bytes, + out_stream + ); + comp_outputs[i]->unlock(); + }); } RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto c1 = Clock::now(); From dd793fdae7dfe09943803ff844820693f56da1c1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 6 Nov 2025 08:53:41 -0800 Subject: [PATCH 04/26] Wait for buffer before sending --- cpp/benchmarks/bench_comp_comm.cpp | 34 +++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 5f88bc5eb..a678796df 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -635,6 +635,9 @@ RunResult run_once( br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); + if (!send_buf->is_latest_write_done()) { + send_buf->stream().synchronize(); + } send_futs.push_back(comm->send(std::move(send_buf), dst, tag_nocomp)); } } @@ -677,6 +680,9 @@ RunResult run_once( SizeHeader h{static_cast(comp_output_sizes[i])}; std::memcpy(p, &h, sizeof(SizeHeader)); }); + if (!hdr->is_latest_write_done()) { + hdr->stream().synchronize(); + } send_hdr_futs.push_back(comm->send(std::move(hdr), dst, tag_size)); } } @@ -693,21 +699,29 @@ RunResult run_once( for (std::size_t i = 0; i < data.items.size(); ++i) { if (src != rank) { // reuse comp_output_sizes[i] as expected size since peers are symmetric - auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto buf = br->allocate(comp_output_sizes[i], stream, res); - recv_cmp_futs.push_back(comm->recv(src, tag_payload, std::move(buf))); + if (comp_output_sizes[i] > 0) { + auto res = + br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto buf = br->allocate(comp_output_sizes[i], stream, res); + recv_cmp_futs.push_back(comm->recv(src, tag_payload, std::move(buf))); + } } } for (std::size_t i = 0; i < data.items.size(); ++i) { if (dst != rank) { // send compressed - auto tmp_buf = - br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto send_buf = br->allocate(comp_output_sizes[i], stream, tmp_buf); - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - send_cmp_futs.push_back( - comm->send(std::move(send_buf), dst, tag_payload) - ); + if (comp_output_sizes[i] > 0) { + auto tmp_buf = + br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto send_buf = br->allocate(comp_output_sizes[i], stream, tmp_buf); + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + if (!send_buf->is_latest_write_done()) { + send_buf->stream().synchronize(); + } + send_cmp_futs.push_back( + comm->send(std::move(send_buf), dst, tag_payload) + ); + } } } } From 19c81c8e1dc0703d8489fd63ca16c416cb11247d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 6 Nov 2025 10:20:49 -0800 Subject: [PATCH 05/26] Improve measuring received 
messages --- cpp/benchmarks/bench_comp_comm.cpp | 40 +++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index a678796df..43d2eef63 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -476,7 +476,10 @@ struct PackedItem { struct BuffersToSend { // For each op, we will send these items std::vector items; + // Uncompressed bytes that represent the actual payload we transmit (device data only) std::size_t total_uncompressed_bytes{0}; + // Convenience: device payload bytes (same as total_uncompressed_bytes here) + std::size_t total_payload_bytes{0}; }; BuffersToSend make_packed_items( @@ -489,8 +492,8 @@ BuffersToSend make_packed_items( if (mode == PackMode::Table) { auto item = PackedItem{}; item.packed = pack_table_to_packed(table.view(), stream, br); - ret.total_uncompressed_bytes += - item.packed->data->size + item.packed->metadata->size(); + ret.total_uncompressed_bytes += item.packed->data->size; + ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); } else { auto tv = table.view(); @@ -498,8 +501,8 @@ BuffersToSend make_packed_items( cudf::table_view col_tv{std::vector{tv.column(i)}}; auto item = PackedItem{}; item.packed = pack_table_to_packed(col_tv, stream, br); - ret.total_uncompressed_bytes += - item.packed->data->size + item.packed->metadata->size(); + ret.total_uncompressed_bytes += item.packed->data->size; + ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); } } @@ -642,14 +645,23 @@ RunResult run_once( } } } - while (!send_futs.empty()) { - std::ignore = comm->test_some(send_futs); - } - auto a1 = Clock::now(); - while (!recv_futs.empty()) { - std::ignore = comm->test_some(recv_futs); + // Drive send/recv completion concurrently and timestamp each independently + std::optional send_done_tp; + std::optional recv_done_tp; + while (!send_futs.empty() || !recv_futs.empty()) { + if (!send_futs.empty()) { + std::ignore = comm->test_some(send_futs); + if (!send_done_tp && send_futs.empty()) + send_done_tp = Clock::now(); + } + if (!recv_futs.empty()) { + std::ignore = comm->test_some(recv_futs); + if (!recv_done_tp && recv_futs.empty()) + recv_done_tp = Clock::now(); + } } - auto a2 = Clock::now(); + auto a1 = send_done_tp.value_or(Clock::now()); + auto a2 = recv_done_tp.value_or(Clock::now()); // Phase B: compressed path (send size header, then compressed payload) auto b0 = Clock::now(); @@ -766,14 +778,14 @@ RunResult run_once( RunResult result{}; result.times.compress_s = std::chrono::duration(t1 - t0).count(); result.times.send_only_s = std::chrono::duration(a1 - a0).count(); - result.times.recv_only_s = std::chrono::duration(a2 - a1).count(); + result.times.recv_only_s = std::chrono::duration(a2 - a0).count(); result.times.comp_send_s = std::chrono::duration(b1 - b0).count(); result.times.recv_decomp_s = std::chrono::duration(b2 - b1).count() + std::chrono::duration(c1 - c0).count(); result.times.decompress_s = std::chrono::duration(c1 - c0).count(); - result.counts.logical_uncompressed_bytes = - data.total_uncompressed_bytes * args.num_ops; + // Use payload (device) bytes as the logical uncompressed size for throughput + result.counts.logical_uncompressed_bytes = data.total_payload_bytes * args.num_ops; result.counts.logical_compressed_bytes = std::accumulate( comp_output_sizes.begin(), comp_output_sizes.end(), std::size_t{0} From 
802d95c3c8024b2a1efa4122a18402c19d369a7c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 7 Nov 2025 02:12:21 -0800 Subject: [PATCH 06/26] Rename to column --- cpp/benchmarks/bench_comp_comm.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 43d2eef63..a8718b308 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -57,7 +57,7 @@ namespace { enum class PackMode { Table, - Columns + Column }; enum class Algo { Cascaded, @@ -376,12 +376,11 @@ struct ArgumentParser { std::string v{optarg}; if (v == "table") args_.pack_mode = PackMode::Table; - else if (v == "columns") - args_.pack_mode = PackMode::Columns; + else if (v == "column") + args_.pack_mode = PackMode::Column; else RAPIDSMPF_FAIL( - "-P must be one of {table, columns}", - std::invalid_argument + "-P must be one of {table, column}", std::invalid_argument ); break; } @@ -420,7 +419,7 @@ struct ArgumentParser { << " -m RMM MR {cuda, pool, async, managed} " "(default: pool)\n" << " -F Parquet file glob/pattern (required)\n" - << " -P Packing mode {table, columns} (default: " + << " -P Packing mode {table, column} (default: " "table)\n" << " -A {cascaded, lz4} (default: cascaded)\n" << " -K Algo params, e.g. " @@ -641,6 +640,8 @@ RunResult run_once( if (!send_buf->is_latest_write_done()) { send_buf->stream().synchronize(); } + // std::cout << "Sending payload of size " << nocomp_payloads[i]->size << + // " to rank " << dst << std::endl; send_futs.push_back(comm->send(std::move(send_buf), dst, tag_nocomp)); } } From d3720d5f7515996376184971c3d88c5ea8592162 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 06:37:55 -0800 Subject: [PATCH 07/26] Use roundtrip transfers --- cpp/benchmarks/bench_comp_comm.cpp | 312 ++++++++++++++--------------- 1 file changed, 148 insertions(+), 164 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index a8718b308..b28b2b5fb 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -97,7 +97,6 @@ std::vector expand_glob(std::string const& pattern) { } } globfree(&glob_result); - std::sort(files.begin(), files.end()); return files; } @@ -516,10 +515,9 @@ struct SizeHeader { struct Timings { double compress_s{0.0}; double decompress_s{0.0}; - double comp_send_s{0.0}; - double recv_decomp_s{0.0}; - double send_only_s{0.0}; - double recv_only_s{0.0}; + // Round-trip totals measured at initiator + double rt_nocomp_s{0.0}; + double rt_comp_s{0.0}; }; // Returns timings and bytes counters @@ -540,7 +538,8 @@ RunResult run_once( BufferResource* br, std::shared_ptr const& statistics, BuffersToSend const& data, - NvcompCodec& codec + NvcompCodec& codec, + std::uint64_t run_index ) { (void)statistics; auto const nranks = comm->nranks(); @@ -548,9 +547,10 @@ RunResult run_once( auto const dst = static_cast((rank + 1) % nranks); auto const src = static_cast((rank - 1 + nranks) % nranks); - Tag tag_size{1, 0}; - Tag tag_payload{1, 1}; - Tag tag_nocomp{2, 0}; + Tag tag_ping_nc{10, 0}; + Tag tag_pong_nc{10, 1}; + Tag tag_ping_c{11, 0}; + Tag tag_pong_c{11, 1}; // Clone packed items into raw device buffers for repeated ops std::vector> nocomp_payloads; @@ -614,138 +614,138 @@ RunResult run_once( RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = Clock::now(); - // Phase A: pure send/recv (no compression) - auto a0 = Clock::now(); - std::vector> send_futs; 
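// What patch 07 changes (hedged summary): the overlapped send/recv phases being
// removed here are replaced by ping-pong round trips, where the initiator
// timestamps before posting pings and after draining pongs, so each op costs
// one RTT and one-way throughput is later reported as 2 * bytes / RTT.
// Completion is driven by polling; the template arguments stripped from the
// surrounding lines are assumed to be std::unique_ptr<Communicator::Future>:
//
//   std::vector<std::unique_ptr<Communicator::Future>> futs = /* posted ops */;
//   while (!futs.empty()) {
//       std::ignore = comm->test_some(futs);  // removes completed futures
//   }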
- std::vector> recv_futs; - send_futs.reserve(args.num_ops * nocomp_payloads.size()); - recv_futs.reserve(args.num_ops * nocomp_payloads.size()); - + // Phase A (RTT no compression): ping-pong per op + Duration rt_nc_total{0}; for (std::uint64_t op = 0; op < args.num_ops; ++op) { - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - // post recv first - if (src != rank) { + bool initiator = + ((static_cast(rank) + op + run_index) % 2ull) == 0ull; + auto rt_start = Clock::now(); + if (initiator) { + // Initiator: post pong recvs, then ping sends + std::vector> pong_recvs; + std::vector> ping_sends; + pong_recvs.reserve(nocomp_payloads.size()); + ping_sends.reserve(nocomp_payloads.size()); + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { auto res = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); - recv_futs.push_back(comm->recv(src, tag_nocomp, std::move(recv_buf))); + pong_recvs.push_back(comm->recv(src, tag_pong_nc, std::move(recv_buf))); } - } - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - if (dst != rank) { + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { auto res = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); - if (!send_buf->is_latest_write_done()) { + if (!send_buf->is_latest_write_done()) send_buf->stream().synchronize(); - } - // std::cout << "Sending payload of size " << nocomp_payloads[i]->size << - // " to rank " << dst << std::endl; - send_futs.push_back(comm->send(std::move(send_buf), dst, tag_nocomp)); + ping_sends.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc)); } - } - } - // Drive send/recv completion concurrently and timestamp each independently - std::optional send_done_tp; - std::optional recv_done_tp; - while (!send_futs.empty() || !recv_futs.empty()) { - if (!send_futs.empty()) { - std::ignore = comm->test_some(send_futs); - if (!send_done_tp && send_futs.empty()) - send_done_tp = Clock::now(); - } - if (!recv_futs.empty()) { - std::ignore = comm->test_some(recv_futs); - if (!recv_done_tp && recv_futs.empty()) - recv_done_tp = Clock::now(); - } - } - auto a1 = send_done_tp.value_or(Clock::now()); - auto a2 = recv_done_tp.value_or(Clock::now()); - - // Phase B: compressed path (send size header, then compressed payload) - auto b0 = Clock::now(); - std::vector> send_hdr_futs; - std::vector> send_cmp_futs; - std::vector> recv_hdr_futs; - std::vector> recv_cmp_futs; - send_hdr_futs.reserve(args.num_ops * data.items.size()); - send_cmp_futs.reserve(args.num_ops * data.items.size()); - recv_hdr_futs.reserve(args.num_ops * data.items.size()); - recv_cmp_futs.reserve(args.num_ops * data.items.size()); - - for (std::uint64_t op = 0; op < args.num_ops; ++op) { - for (std::size_t i = 0; i < data.items.size(); ++i) { - // post recv header - if (src != rank) { - auto res_h = br->reserve_or_fail(sizeof(SizeHeader), MemoryType::HOST); - auto hdr = br->allocate(sizeof(SizeHeader), stream, res_h); - recv_hdr_futs.push_back(comm->recv(src, tag_size, std::move(hdr))); + while (!ping_sends.empty()) { + std::ignore = comm->test_some(ping_sends); } - } - for (std::size_t i = 0; i < data.items.size(); ++i) { - if (dst != rank) { - auto res_h = br->reserve_or_fail(sizeof(SizeHeader), MemoryType::HOST); - auto hdr = br->allocate(sizeof(SizeHeader), stream, res_h); - // write header - 
hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { - SizeHeader h{static_cast(comp_output_sizes[i])}; - std::memcpy(p, &h, sizeof(SizeHeader)); - }); - if (!hdr->is_latest_write_done()) { - hdr->stream().synchronize(); - } - send_hdr_futs.push_back(comm->send(std::move(hdr), dst, tag_size)); + while (!pong_recvs.empty()) { + std::ignore = comm->test_some(pong_recvs); + } + } else { + // Responder: post ping recvs, then pong sends + std::vector> ping_recvs; + std::vector> pong_sends; + ping_recvs.reserve(nocomp_payloads.size()); + pong_sends.reserve(nocomp_payloads.size()); + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + auto res = + br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); + auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + ping_recvs.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf))); + } + while (!ping_recvs.empty()) { + std::ignore = comm->test_some(ping_recvs); + } + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + auto res = + br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); + auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); + if (!send_buf->is_latest_write_done()) + send_buf->stream().synchronize(); + pong_sends.push_back(comm->send(std::move(send_buf), src, tag_pong_nc)); + } + while (!pong_sends.empty()) { + std::ignore = comm->test_some(pong_sends); } } - } - while (!send_hdr_futs.empty()) { - std::ignore = comm->test_some(send_hdr_futs); - } - while (!recv_hdr_futs.empty()) { - std::ignore = comm->test_some(recv_hdr_futs); + auto rt_end = Clock::now(); + // Each rank measures its own RTT locally + rt_nc_total += (rt_end - rt_start); } - // Post payload recvs now that we know sizes + // Phase B (RTT compressed payload only): ping-pong of compressed buffers (no headers) + Duration rt_c_total{0}; for (std::uint64_t op = 0; op < args.num_ops; ++op) { - for (std::size_t i = 0; i < data.items.size(); ++i) { - if (src != rank) { - // reuse comp_output_sizes[i] as expected size since peers are symmetric - if (comp_output_sizes[i] > 0) { - auto res = - br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto buf = br->allocate(comp_output_sizes[i], stream, res); - recv_cmp_futs.push_back(comm->recv(src, tag_payload, std::move(buf))); - } + bool initiator = + ((static_cast(rank) + op + run_index) % 2ull) == 0ull; + auto rt_start = Clock::now(); + if (initiator) { + std::vector> pong_recvs; + std::vector> ping_sends; + pong_recvs.reserve(data.items.size()); + ping_sends.reserve(data.items.size()); + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (comp_output_sizes[i] == 0) + continue; + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto recv_buf = br->allocate(comp_output_sizes[i], stream, res); + pong_recvs.push_back(comm->recv(src, tag_pong_c, std::move(recv_buf))); } - } - for (std::size_t i = 0; i < data.items.size(); ++i) { - if (dst != rank) { - // send compressed - if (comp_output_sizes[i] > 0) { - auto tmp_buf = - br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto send_buf = br->allocate(comp_output_sizes[i], stream, tmp_buf); - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - if (!send_buf->is_latest_write_done()) { - send_buf->stream().synchronize(); - } - send_cmp_futs.push_back( - comm->send(std::move(send_buf), dst, tag_payload) - ); - } + for (std::size_t i = 0; i < data.items.size(); 
++i) { + if (comp_output_sizes[i] == 0) + continue; + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto send_buf = br->allocate(comp_output_sizes[i], stream, res); + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + if (!send_buf->is_latest_write_done()) + send_buf->stream().synchronize(); + ping_sends.push_back(comm->send(std::move(send_buf), dst, tag_ping_c)); + } + while (!ping_sends.empty()) { + std::ignore = comm->test_some(ping_sends); + } + while (!pong_recvs.empty()) { + std::ignore = comm->test_some(pong_recvs); + } + } else { + std::vector> ping_recvs; + std::vector> pong_sends; + ping_recvs.reserve(data.items.size()); + pong_sends.reserve(data.items.size()); + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (comp_output_sizes[i] == 0) + continue; + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto recv_buf = br->allocate(comp_output_sizes[i], stream, res); + ping_recvs.push_back(comm->recv(src, tag_ping_c, std::move(recv_buf))); + } + while (!ping_recvs.empty()) { + std::ignore = comm->test_some(ping_recvs); + } + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (comp_output_sizes[i] == 0) + continue; + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto send_buf = br->allocate(comp_output_sizes[i], stream, res); + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + if (!send_buf->is_latest_write_done()) + send_buf->stream().synchronize(); + pong_sends.push_back(comm->send(std::move(send_buf), src, tag_pong_c)); + } + while (!pong_sends.empty()) { + std::ignore = comm->test_some(pong_sends); } } + auto rt_end = Clock::now(); + rt_c_total += (rt_end - rt_start); } - while (!send_cmp_futs.empty()) { - std::ignore = comm->test_some(send_cmp_futs); - } - auto b1 = Clock::now(); - while (!recv_cmp_futs.empty()) { - std::ignore = comm->test_some(recv_cmp_futs); - } - auto b2 = Clock::now(); // Decompress received buffers (simulate by decompressing our own produced outputs in // symmetric setup) @@ -778,11 +778,8 @@ RunResult run_once( RunResult result{}; result.times.compress_s = std::chrono::duration(t1 - t0).count(); - result.times.send_only_s = std::chrono::duration(a1 - a0).count(); - result.times.recv_only_s = std::chrono::duration(a2 - a0).count(); - result.times.comp_send_s = std::chrono::duration(b1 - b0).count(); - result.times.recv_decomp_s = std::chrono::duration(b2 - b1).count() - + std::chrono::duration(c1 - c0).count(); + result.times.rt_nocomp_s = rt_nc_total.count(); + result.times.rt_comp_s = rt_c_total.count(); result.times.decompress_s = std::chrono::duration(c1 - c0).count(); // Use payload (device) bytes as the logical uncompressed size for throughput @@ -906,14 +903,11 @@ int main(int argc, char** argv) { auto codec = make_codec(args.algo, args.params); // Runs - std::vector compress_t, decompress_t, comp_send_t, recv_decomp_t, send_t, - recv_t; + std::vector compress_t, decompress_t, rt_nc_t, rt_c_t; compress_t.reserve(args.num_runs); decompress_t.reserve(args.num_runs); - comp_send_t.reserve(args.num_runs); - recv_decomp_t.reserve(args.num_runs); - send_t.reserve(args.num_runs); - recv_t.reserve(args.num_runs); + rt_nc_t.reserve(args.num_runs); + rt_c_t.reserve(args.num_runs); std::size_t logical_bytes = packed.total_uncompressed_bytes * args.num_ops; @@ -921,28 +915,29 @@ int main(int argc, char** argv) { if (i == args.num_warmups + args.num_runs - 1) { stats = std::make_shared(/* enable = */ true); } - auto rr = 
run_once(comm, args, stream, &br, stats, packed, *codec); + auto rr = run_once(comm, args, stream, &br, stats, packed, *codec, i); double cBps = static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.compress_s; double dBps = static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.decompress_s; - double csBps = static_cast(rr.counts.logical_uncompressed_bytes) - / rr.times.comp_send_s; - double rdBps = static_cast(rr.counts.logical_uncompressed_bytes) - / rr.times.recv_decomp_s; - double sBps = static_cast(rr.counts.logical_uncompressed_bytes) - / rr.times.send_only_s; - double rBps = static_cast(rr.counts.logical_uncompressed_bytes) - / rr.times.recv_only_s; + // Round-trip one-way throughput: 2 * bytes_one_way / RTT + double rt_nc_Bps = + rr.times.rt_nocomp_s > 0.0 + ? (2.0 * static_cast(rr.counts.logical_uncompressed_bytes)) + / rr.times.rt_nocomp_s + : 0.0; + double rt_c_Bps = + rr.times.rt_comp_s > 0.0 + ? (2.0 * static_cast(rr.counts.logical_uncompressed_bytes)) + / rr.times.rt_comp_s + : 0.0; std::stringstream ss; ss << "compress: " << format_nbytes(cBps) << "/s | decompress: " << format_nbytes(dBps) - << "/s | comp+send: " << format_nbytes(csBps) - << "/s | recv+decomp: " << format_nbytes(rdBps) - << "/s | send-only: " << format_nbytes(sBps) - << "/s | recv-only: " << format_nbytes(rBps) << "/s"; + << "/s | rt(nocomp): " << format_nbytes(rt_nc_Bps) + << "/s | rt(comp): " << format_nbytes(rt_c_Bps) << "/s"; if (i < args.num_warmups) ss << " (warmup run)"; log.print(ss.str()); @@ -954,18 +949,8 @@ int main(int argc, char** argv) { decompress_t.push_back( static_cast(rr.counts.logical_uncompressed_bytes) / dBps ); - comp_send_t.push_back( - static_cast(rr.counts.logical_uncompressed_bytes) / csBps - ); - recv_decomp_t.push_back( - static_cast(rr.counts.logical_uncompressed_bytes) / rdBps - ); - send_t.push_back( - static_cast(rr.counts.logical_uncompressed_bytes) / sBps - ); - recv_t.push_back( - static_cast(rr.counts.logical_uncompressed_bytes) / rBps - ); + rt_nc_t.push_back(rr.times.rt_nocomp_s); + rt_c_t.push_back(rr.times.rt_comp_s); } } @@ -980,18 +965,17 @@ int main(int argc, char** argv) { if (!compress_t.empty()) { double mean_elapsed_c = harmonic_mean(compress_t); double mean_elapsed_d = harmonic_mean(decompress_t); - double mean_elapsed_cs = harmonic_mean(comp_send_t); - double mean_elapsed_rd = harmonic_mean(recv_decomp_t); - double mean_elapsed_s = harmonic_mean(send_t); - double mean_elapsed_r = harmonic_mean(recv_t); + double mean_rt_nc = harmonic_mean(rt_nc_t); + double mean_rt_c = harmonic_mean(rt_c_t); std::stringstream ss; ss << "means: compress: " << format_nbytes(logical_bytes / mean_elapsed_c) << "/s" << " | decompress: " << format_nbytes(logical_bytes / mean_elapsed_d) << "/s" - << " | comp+send: " << format_nbytes(logical_bytes / mean_elapsed_cs) << "/s" - << " | recv+decomp: " << format_nbytes(logical_bytes / mean_elapsed_rd) << "/s" - << " | send-only: " << format_nbytes(logical_bytes / mean_elapsed_s) << "/s" - << " | recv-only: " << format_nbytes(logical_bytes / mean_elapsed_r) << "/s"; + << " | rt(nocomp): " + << format_nbytes((2.0 * static_cast(logical_bytes)) / mean_rt_nc) + << "/s | rt(comp): " + << format_nbytes((2.0 * static_cast(logical_bytes)) / mean_rt_c) + << "/s"; log.print(ss.str()); } From 99d3ba94ddcab0d32ea2369eeb50d19d3d98d038 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 08:07:04 -0800 Subject: [PATCH 08/26] Fix roundtrip on >2 ranks --- cpp/benchmarks/bench_comp_comm.cpp | 217 
+++++++++++++++++------------ 1 file changed, 127 insertions(+), 90 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index b28b2b5fb..6ff524ddc 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -614,65 +614,54 @@ RunResult run_once( RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = Clock::now(); - // Phase A (RTT no compression): ping-pong per op + // Phase A (RTT no compression): ping-pong per op (sequential per item to avoid + // deadlocks) Duration rt_nc_total{0}; for (std::uint64_t op = 0; op < args.num_ops; ++op) { bool initiator = ((static_cast(rank) + op + run_index) % 2ull) == 0ull; auto rt_start = Clock::now(); if (initiator) { - // Initiator: post pong recvs, then ping sends - std::vector> pong_recvs; - std::vector> ping_sends; - pong_recvs.reserve(nocomp_payloads.size()); - ping_sends.reserve(nocomp_payloads.size()); for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto res = + // post pong recv and send ping, then wait both + auto res_r = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); - pong_recvs.push_back(comm->recv(src, tag_pong_nc, std::move(recv_buf))); - } - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto res = + auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res_r); + std::vector> futs; + futs.push_back(comm->recv(dst, tag_pong_nc, std::move(recv_buf))); + auto res_s = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res_s); buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); if (!send_buf->is_latest_write_done()) send_buf->stream().synchronize(); - ping_sends.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc)); - } - while (!ping_sends.empty()) { - std::ignore = comm->test_some(ping_sends); - } - while (!pong_recvs.empty()) { - std::ignore = comm->test_some(pong_recvs); + futs.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc)); + while (!futs.empty()) { + std::ignore = comm->test_some(futs); + } } } else { - // Responder: post ping recvs, then pong sends - std::vector> ping_recvs; - std::vector> pong_sends; - ping_recvs.reserve(nocomp_payloads.size()); - pong_sends.reserve(nocomp_payloads.size()); + // Responder: for each item, recv ping then send pong for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto res = + auto res_r = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); - ping_recvs.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf))); - } - while (!ping_recvs.empty()) { - std::ignore = comm->test_some(ping_recvs); - } - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto res = + auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res_r); + std::vector> rf; + rf.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf))); + while (!rf.empty()) { + std::ignore = comm->test_some(rf); + } + auto res_s = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res_s); buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); if (!send_buf->is_latest_write_done()) 
send_buf->stream().synchronize();
-                pong_sends.push_back(comm->send(std::move(send_buf), src, tag_pong_nc));
-            }
-            while (!pong_sends.empty()) {
-                std::ignore = comm->test_some(pong_sends);
+                std::vector<std::unique_ptr<Communicator::Future>> sf;
+                sf.push_back(comm->send(std::move(send_buf), src, tag_pong_nc));
+                while (!sf.empty()) {
+                    std::ignore = comm->test_some(sf);
+                }
             }
         }
         auto rt_end = Clock::now();
@@ -680,67 +669,115 @@ RunResult run_once(
         rt_nc_total += (rt_end - rt_start);
     }

-    // Phase B (RTT compressed payload only): ping-pong of compressed buffers (no headers)
+    // Phase B (RTT compressed payload only): ping-pong with size headers per item
     Duration rt_c_total{0};
     for (std::uint64_t op = 0; op < args.num_ops; ++op) {
         bool initiator =
             ((static_cast<std::uint64_t>(rank) + op + run_index) % 2ull) == 0ull;
         auto rt_start = Clock::now();
         if (initiator) {
-            std::vector<std::unique_ptr<Communicator::Future>> pong_recvs;
-            std::vector<std::unique_ptr<Communicator::Future>> ping_sends;
-            pong_recvs.reserve(data.items.size());
-            ping_sends.reserve(data.items.size());
             for (std::size_t i = 0; i < data.items.size(); ++i) {
-                if (comp_output_sizes[i] == 0)
-                    continue;
-                auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE);
-                auto recv_buf = br->allocate(comp_output_sizes[i], stream, res);
-                pong_recvs.push_back(comm->recv(src, tag_pong_c, std::move(recv_buf)));
-            }
-            for (std::size_t i = 0; i < data.items.size(); ++i) {
-                if (comp_output_sizes[i] == 0)
-                    continue;
-                auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE);
-                auto send_buf = br->allocate(comp_output_sizes[i], stream, res);
-                buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]);
-                if (!send_buf->is_latest_write_done())
-                    send_buf->stream().synchronize();
-                ping_sends.push_back(comm->send(std::move(send_buf), dst, tag_ping_c));
-            }
-            while (!ping_sends.empty()) {
-                std::ignore = comm->test_some(ping_sends);
-            }
-            while (!pong_recvs.empty()) {
-                std::ignore = comm->test_some(pong_recvs);
+                // Send header with size to dst
+                std::uint64_t sz = static_cast<std::uint64_t>(comp_output_sizes[i]);
+                auto res_h = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST);
+                auto hdr = br->allocate(sizeof(std::uint64_t), stream, res_h);
+                hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) {
+                    std::memcpy(p, &sz, sizeof(std::uint64_t));
+                });
+                if (!hdr->is_latest_write_done())
+                    hdr->stream().synchronize();
+                std::vector<std::unique_ptr<Communicator::Future>> hf;
+                hf.push_back(comm->send(std::move(hdr), dst, tag_ping_c));
+                while (!hf.empty()) {
+                    std::ignore = comm->test_some(hf);
+                }
+                // Receive pong header with size from dst (blocking wait)
+                auto res_hr =
+                    br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST);
+                auto hdr_r = br->allocate(sizeof(std::uint64_t), stream, res_hr);
+                auto fut_hdr = comm->recv(dst, tag_pong_c, std::move(hdr_r));
+                auto hdr_buf = comm->wait(std::move(fut_hdr));
+                auto* p = hdr_buf->exclusive_data_access();
+                std::uint64_t pong_sz = 0;
+                std::memcpy(&pong_sz, p, sizeof(std::uint64_t));
+                hdr_buf->unlock();
+                // Send ping payload (if any)
+                if (sz > 0) {
+                    auto res_s = br->reserve_or_fail(sz, MemoryType::DEVICE);
+                    auto send_buf = br->allocate(sz, stream, res_s);
+                    if (comp_output_sizes[i] > 0) {
+                        buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]);
+                    }
+                    if (!send_buf->is_latest_write_done())
+                        send_buf->stream().synchronize();
+                    std::vector<std::unique_ptr<Communicator::Future>> sf;
+                    sf.push_back(comm->send(std::move(send_buf), dst, tag_ping_c));
+                    while (!sf.empty()) {
+                        std::ignore = comm->test_some(sf);
+                    }
+                }
+                // Receive pong payload of announced size
+                if (pong_sz > 0) {
+                    auto res_r = br->reserve_or_fail(pong_sz, MemoryType::DEVICE);
+
auto recv_buf = br->allocate(pong_sz, stream, res_r); + std::vector> rf; + rf.push_back(comm->recv(dst, tag_pong_c, std::move(recv_buf))); + while (!rf.empty()) { + std::ignore = comm->test_some(rf); + } + } } } else { - std::vector> ping_recvs; - std::vector> pong_sends; - ping_recvs.reserve(data.items.size()); - pong_sends.reserve(data.items.size()); for (std::size_t i = 0; i < data.items.size(); ++i) { - if (comp_output_sizes[i] == 0) - continue; - auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto recv_buf = br->allocate(comp_output_sizes[i], stream, res); - ping_recvs.push_back(comm->recv(src, tag_ping_c, std::move(recv_buf))); - } - while (!ping_recvs.empty()) { - std::ignore = comm->test_some(ping_recvs); - } - for (std::size_t i = 0; i < data.items.size(); ++i) { - if (comp_output_sizes[i] == 0) - continue; - auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto send_buf = br->allocate(comp_output_sizes[i], stream, res); - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - if (!send_buf->is_latest_write_done()) - send_buf->stream().synchronize(); - pong_sends.push_back(comm->send(std::move(send_buf), src, tag_pong_c)); - } - while (!pong_sends.empty()) { - std::ignore = comm->test_some(pong_sends); + // Receive ping header with size (blocking wait) + auto res_hr = + br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto hdr_r = br->allocate(sizeof(std::uint64_t), stream, res_hr); + auto fut_hdr = comm->recv(src, tag_ping_c, std::move(hdr_r)); + auto hdr_buf = comm->wait(std::move(fut_hdr)); + auto* p = hdr_buf->exclusive_data_access(); + std::uint64_t ping_sz = 0; + std::memcpy(&ping_sz, p, sizeof(std::uint64_t)); + hdr_buf->unlock(); + // Send pong header with our size + std::uint64_t sz = static_cast(comp_output_sizes[i]); + auto res_h = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto hdr = br->allocate(sizeof(std::uint64_t), stream, res_h); + hdr->write_access([&](std::byte* q, rmm::cuda_stream_view) { + std::memcpy(q, &sz, sizeof(std::uint64_t)); + }); + if (!hdr->is_latest_write_done()) + hdr->stream().synchronize(); + std::vector> hf; + hf.push_back(comm->send(std::move(hdr), src, tag_pong_c)); + while (!hf.empty()) { + std::ignore = comm->test_some(hf); + } + // Receive ping payload + if (ping_sz > 0) { + auto res_r = br->reserve_or_fail(ping_sz, MemoryType::DEVICE); + auto recv_buf = br->allocate(ping_sz, stream, res_r); + std::vector> rf; + rf.push_back(comm->recv(src, tag_ping_c, std::move(recv_buf))); + while (!rf.empty()) { + std::ignore = comm->test_some(rf); + } + } + // Send pong payload + if (sz > 0) { + auto res_s = br->reserve_or_fail(sz, MemoryType::DEVICE); + auto send_buf = br->allocate(sz, stream, res_s); + if (comp_output_sizes[i] > 0) { + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + } + if (!send_buf->is_latest_write_done()) + send_buf->stream().synchronize(); + std::vector> sf; + sf.push_back(comm->send(std::move(send_buf), src, tag_pong_c)); + while (!sf.empty()) { + std::ignore = comm->test_some(sf); + } + } } } auto rt_end = Clock::now(); From 99245f3bc42776520d61a0d73d0f22b2665fe8cf Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 10:17:23 -0800 Subject: [PATCH 09/26] Reduce code duplication --- cpp/benchmarks/bench_comp_comm.cpp | 203 +++++++++++++++-------------- 1 file changed, 107 insertions(+), 96 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp 
index 6ff524ddc..b4f90cd12 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -332,6 +332,93 @@ std::unique_ptr make_codec(Algo algo, KvParams const& p) { } } +static inline void ensure_ready(Buffer& buf) { + if (!buf.is_latest_write_done()) { + buf.stream().synchronize(); + } +} + +static inline std::unique_ptr alloc_device( + BufferResource* br, rmm::cuda_stream_view stream, std::size_t size +) { + auto res = br->reserve_or_fail(size, MemoryType::DEVICE); + return br->allocate(size, stream, res); +} + +static inline std::unique_ptr alloc_and_copy_device( + BufferResource* br, rmm::cuda_stream_view stream, Buffer const& src +) { + auto out = alloc_device(br, stream, src.size); + buffer_copy(*out, src, src.size); + return out; +} + +static inline void send_blocking( + std::shared_ptr const& comm, + std::unique_ptr buf, + Rank to, + Tag tag +) { + ensure_ready(*buf); + std::vector> futs; + futs.push_back(comm->send(std::move(buf), to, tag)); + while (!futs.empty()) { + std::ignore = comm->test_some(futs); + } +} + +static inline std::unique_ptr recv_blocking( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank from, + Tag tag, + std::size_t size +) { + auto buf = alloc_device(br, stream, size); + auto fut = comm->recv(from, tag, std::move(buf)); + return comm->wait(std::move(fut)); +} + +static inline void send_u64_header_blocking( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank to, + Tag tag, + std::uint64_t value +) { + auto res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto hdr = br->allocate(sizeof(std::uint64_t), stream, res); + hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { + std::memcpy(p, &value, sizeof(std::uint64_t)); + }); + ensure_ready(*hdr); + std::vector> futs; + futs.push_back(comm->send(std::move(hdr), to, tag)); + while (!futs.empty()) { + std::ignore = comm->test_some(futs); + } +} + +static inline std::uint64_t recv_u64_header_blocking( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank from, + Tag tag +) { + auto res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto hdr = br->allocate(sizeof(std::uint64_t), stream, res); + auto fut = comm->recv(from, tag, std::move(hdr)); + auto buf = comm->wait(std::move(fut)); + auto* p = buf->exclusive_data_access(); + std::uint64_t value = 0; + std::memcpy(&value, p, sizeof(std::uint64_t)); + buf->unlock(); + return value; +} + // Convenience: wrap metadata + gpu_data into rapidsmpf::PackedData static std::unique_ptr pack_table_to_packed( cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br @@ -624,17 +711,11 @@ RunResult run_once( if (initiator) { for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { // post pong recv and send ping, then wait both - auto res_r = - br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res_r); + auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size); std::vector> futs; futs.push_back(comm->recv(dst, tag_pong_nc, std::move(recv_buf))); - auto res_s = - br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res_s); - buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); - if (!send_buf->is_latest_write_done()) - send_buf->stream().synchronize(); + auto send_buf = 
alloc_and_copy_device(br, stream, *nocomp_payloads[i]);
+                ensure_ready(*send_buf);
                 futs.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc));
                 while (!futs.empty()) {
                     std::ignore = comm->test_some(futs);
@@ -643,20 +724,14 @@ RunResult run_once(
         } else {
             // Responder: for each item, recv ping then send pong
             for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) {
-                auto res_r =
-                    br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE);
-                auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res_r);
+                auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size);
                 std::vector<std::unique_ptr<Communicator::Future>> rf;
                 rf.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf)));
                 while (!rf.empty()) {
                     std::ignore = comm->test_some(rf);
                 }
-                auto res_s =
-                    br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE);
-                auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res_s);
-                buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size);
-                if (!send_buf->is_latest_write_done())
-                    send_buf->stream().synchronize();
+                auto send_buf = alloc_and_copy_device(br, stream, *nocomp_payloads[i]);
+                ensure_ready(*send_buf);
                 std::vector<std::unique_ptr<Communicator::Future>> sf;
                 sf.push_back(comm->send(std::move(send_buf), src, tag_pong_nc));
                 while (!sf.empty()) {
@@ -679,104 +754,40 @@ RunResult run_once(
             for (std::size_t i = 0; i < data.items.size(); ++i) {
                 // Send header with size to dst
                 std::uint64_t sz = static_cast<std::uint64_t>(comp_output_sizes[i]);
-                auto res_h = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST);
-                auto hdr = br->allocate(sizeof(std::uint64_t), stream, res_h);
-                hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) {
-                    std::memcpy(p, &sz, sizeof(std::uint64_t));
-                });
-                if (!hdr->is_latest_write_done())
-                    hdr->stream().synchronize();
-                std::vector<std::unique_ptr<Communicator::Future>> hf;
-                hf.push_back(comm->send(std::move(hdr), dst, tag_ping_c));
-                while (!hf.empty()) {
-                    std::ignore = comm->test_some(hf);
-                }
+                send_u64_header_blocking(comm, br, stream, dst, tag_ping_c, sz);
                 // Receive pong header with size from dst (blocking wait)
-                auto res_hr =
-                    br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST);
-                auto hdr_r = br->allocate(sizeof(std::uint64_t), stream, res_hr);
-                auto fut_hdr = comm->recv(dst, tag_pong_c, std::move(hdr_r));
-                auto hdr_buf = comm->wait(std::move(fut_hdr));
-                auto* p = hdr_buf->exclusive_data_access();
-                std::uint64_t pong_sz = 0;
-                std::memcpy(&pong_sz, p, sizeof(std::uint64_t));
-                hdr_buf->unlock();
+                std::uint64_t pong_sz =
+                    recv_u64_header_blocking(comm, br, stream, dst, tag_pong_c);
                 // Send ping payload (if any)
                 if (sz > 0) {
-                    auto res_s = br->reserve_or_fail(sz, MemoryType::DEVICE);
-                    auto send_buf = br->allocate(sz, stream, res_s);
-                    if (comp_output_sizes[i] > 0) {
+                    auto send_buf = alloc_device(br, stream, sz);
+                    if (comp_output_sizes[i] > 0)
                         buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]);
-                    }
-                    if (!send_buf->is_latest_write_done())
-                        send_buf->stream().synchronize();
-                    std::vector<std::unique_ptr<Communicator::Future>> sf;
-                    sf.push_back(comm->send(std::move(send_buf), dst, tag_ping_c));
-                    while (!sf.empty()) {
-                        std::ignore = comm->test_some(sf);
-                    }
+                    send_blocking(comm, std::move(send_buf), dst, tag_ping_c);
                 }
                 // Receive pong payload of announced size
                 if (pong_sz > 0) {
-                    auto res_r = br->reserve_or_fail(pong_sz, MemoryType::DEVICE);
-                    auto recv_buf = br->allocate(pong_sz, stream, res_r);
-                    std::vector<std::unique_ptr<Communicator::Future>> rf;
-                    rf.push_back(comm->recv(dst, tag_pong_c, std::move(recv_buf)));
-                    while (!rf.empty()) {
-                        std::ignore = comm->test_some(rf);
-                    }
+                    (void)recv_blocking(comm, br, stream, dst, tag_pong_c, pong_sz);
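+                    // The (void) cast documents an intentional discard: this phase
+                    // only times the transfer, so the received payload is never
+                    // inspected; the decompression phase later in run_once() operates
+                    // on locally produced compressed outputs instead.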
} } } else { for (std::size_t i = 0; i < data.items.size(); ++i) { // Receive ping header with size (blocking wait) - auto res_hr = - br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto hdr_r = br->allocate(sizeof(std::uint64_t), stream, res_hr); - auto fut_hdr = comm->recv(src, tag_ping_c, std::move(hdr_r)); - auto hdr_buf = comm->wait(std::move(fut_hdr)); - auto* p = hdr_buf->exclusive_data_access(); - std::uint64_t ping_sz = 0; - std::memcpy(&ping_sz, p, sizeof(std::uint64_t)); - hdr_buf->unlock(); + std::uint64_t ping_sz = + recv_u64_header_blocking(comm, br, stream, src, tag_ping_c); // Send pong header with our size std::uint64_t sz = static_cast(comp_output_sizes[i]); - auto res_h = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto hdr = br->allocate(sizeof(std::uint64_t), stream, res_h); - hdr->write_access([&](std::byte* q, rmm::cuda_stream_view) { - std::memcpy(q, &sz, sizeof(std::uint64_t)); - }); - if (!hdr->is_latest_write_done()) - hdr->stream().synchronize(); - std::vector> hf; - hf.push_back(comm->send(std::move(hdr), src, tag_pong_c)); - while (!hf.empty()) { - std::ignore = comm->test_some(hf); - } + send_u64_header_blocking(comm, br, stream, src, tag_pong_c, sz); // Receive ping payload if (ping_sz > 0) { - auto res_r = br->reserve_or_fail(ping_sz, MemoryType::DEVICE); - auto recv_buf = br->allocate(ping_sz, stream, res_r); - std::vector> rf; - rf.push_back(comm->recv(src, tag_ping_c, std::move(recv_buf))); - while (!rf.empty()) { - std::ignore = comm->test_some(rf); - } + (void)recv_blocking(comm, br, stream, src, tag_ping_c, ping_sz); } // Send pong payload if (sz > 0) { - auto res_s = br->reserve_or_fail(sz, MemoryType::DEVICE); - auto send_buf = br->allocate(sz, stream, res_s); - if (comp_output_sizes[i] > 0) { + auto send_buf = alloc_device(br, stream, sz); + if (comp_output_sizes[i] > 0) buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - } - if (!send_buf->is_latest_write_done()) - send_buf->stream().synchronize(); - std::vector> sf; - sf.push_back(comm->send(std::move(send_buf), src, tag_pong_c)); - while (!sf.empty()) { - std::ignore = comm->test_some(sf); - } + send_blocking(comm, std::move(send_buf), src, tag_pong_c); } } } From 6c61c3637765080840b6cee22c1125380324a81e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 10:27:41 -0800 Subject: [PATCH 10/26] Make uncompressed transfer block identical --- cpp/benchmarks/bench_comp_comm.cpp | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index b4f90cd12..975ab2327 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -708,36 +708,23 @@ RunResult run_once( bool initiator = ((static_cast(rank) + op + run_index) % 2ull) == 0ull; auto rt_start = Clock::now(); - if (initiator) { + auto run_ping_pong_nc = [&](Rank peer, Tag recv_tag, Tag send_tag) { for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - // post pong recv and send ping, then wait both auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size); std::vector> futs; - futs.push_back(comm->recv(dst, tag_pong_nc, std::move(recv_buf))); + futs.push_back(comm->recv(peer, recv_tag, std::move(recv_buf))); auto send_buf = alloc_and_copy_device(br, stream, *nocomp_payloads[i]); ensure_ready(*send_buf); - futs.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc)); + futs.push_back(comm->send(std::move(send_buf), peer, 
send_tag)); while (!futs.empty()) { std::ignore = comm->test_some(futs); } } + }; + if (initiator) { + run_ping_pong_nc(dst, tag_pong_nc, tag_ping_nc); } else { - // Responder: for each item, recv ping then send pong - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size); - std::vector> rf; - rf.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf))); - while (!rf.empty()) { - std::ignore = comm->test_some(rf); - } - auto send_buf = alloc_and_copy_device(br, stream, *nocomp_payloads[i]); - ensure_ready(*send_buf); - std::vector> sf; - sf.push_back(comm->send(std::move(send_buf), src, tag_pong_nc)); - while (!sf.empty()) { - std::ignore = comm->test_some(sf); - } - } + run_ping_pong_nc(src, tag_ping_nc, tag_pong_nc); } auto rt_end = Clock::now(); // Each rank measures its own RTT locally From 525541e528339d251a1cae2027d3e7dac5e8fe87 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 11:21:51 -0800 Subject: [PATCH 11/26] Implement functions to exchange header and payload --- cpp/benchmarks/bench_comp_comm.cpp | 201 +++++++++++++---------------- 1 file changed, 93 insertions(+), 108 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 975ab2327..b8684f25f 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -353,70 +353,68 @@ static inline std::unique_ptr alloc_and_copy_device( return out; } -static inline void send_blocking( - std::shared_ptr const& comm, - std::unique_ptr buf, - Rank to, - Tag tag -) { - ensure_ready(*buf); - std::vector> futs; - futs.push_back(comm->send(std::move(buf), to, tag)); - while (!futs.empty()) { - std::ignore = comm->test_some(futs); - } -} - -static inline std::unique_ptr recv_blocking( - std::shared_ptr const& comm, - BufferResource* br, - rmm::cuda_stream_view stream, - Rank from, - Tag tag, - std::size_t size -) { - auto buf = alloc_device(br, stream, size); - auto fut = comm->recv(from, tag, std::move(buf)); - return comm->wait(std::move(fut)); -} - -static inline void send_u64_header_blocking( +// Non-blocking helpers to exchange headers and payloads concurrently. 
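+// Both ranks call these helpers symmetrically: each posts its header send and its
+// header recv up front, reads the peer's announced size, and only then moves the
+// payloads, so neither rank has to assume the peer's buffer sizes match its own.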
+static inline std::uint64_t exchange_u64_header( std::shared_ptr const& comm, BufferResource* br, rmm::cuda_stream_view stream, - Rank to, - Tag tag, - std::uint64_t value + Rank peer, + Tag send_tag, + Tag recv_tag, + std::uint64_t send_value ) { - auto res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto hdr = br->allocate(sizeof(std::uint64_t), stream, res); - hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { - std::memcpy(p, &value, sizeof(std::uint64_t)); + // Post header send + auto send_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto send_hdr = br->allocate(sizeof(std::uint64_t), stream, send_hdr_res); + send_hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { + std::memcpy(p, &send_value, sizeof(std::uint64_t)); }); - ensure_ready(*hdr); - std::vector> futs; - futs.push_back(comm->send(std::move(hdr), to, tag)); - while (!futs.empty()) { - std::ignore = comm->test_some(futs); + ensure_ready(*send_hdr); + auto send_hdr_fut = comm->send(std::move(send_hdr), peer, send_tag); + // Post header recv + auto recv_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto recv_hdr = br->allocate(sizeof(std::uint64_t), stream, recv_hdr_res); + ensure_ready(*recv_hdr); + auto recv_hdr_fut = comm->recv(peer, recv_tag, std::move(recv_hdr)); + // Wait recv, read value, then ensure send completion + auto recv_hdr_buf = comm->wait(std::move(recv_hdr_fut)); + std::uint64_t recv_value = 0; + { + auto* p = recv_hdr_buf->exclusive_data_access(); + std::memcpy(&recv_value, p, sizeof(std::uint64_t)); + recv_hdr_buf->unlock(); } + std::ignore = comm->wait(std::move(send_hdr_fut)); + return recv_value; } -static inline std::uint64_t recv_u64_header_blocking( +static inline void exchange_payload( std::shared_ptr const& comm, BufferResource* br, rmm::cuda_stream_view stream, - Rank from, - Tag tag + Rank peer, + Tag send_tag, + Tag recv_tag, + std::unique_ptr send_buf, // may be null if no data to send + std::size_t recv_size // may be zero if no data to recv ) { - auto res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto hdr = br->allocate(sizeof(std::uint64_t), stream, res); - auto fut = comm->recv(from, tag, std::move(hdr)); - auto buf = comm->wait(std::move(fut)); - auto* p = buf->exclusive_data_access(); - std::uint64_t value = 0; - std::memcpy(&value, p, sizeof(std::uint64_t)); - buf->unlock(); - return value; + std::unique_ptr data_send_fut; + std::unique_ptr data_recv_fut; + if (recv_size > 0) { + auto recv_buf = alloc_device(br, stream, recv_size); + ensure_ready(*recv_buf); + data_recv_fut = comm->recv(peer, recv_tag, std::move(recv_buf)); + } + if (send_buf && send_buf->size > 0) { + ensure_ready(*send_buf); + data_send_fut = comm->send(std::move(send_buf), peer, send_tag); + } + if (data_recv_fut) { + (void)comm->wait(std::move(data_recv_fut)); + } + if (data_send_fut) { + std::ignore = comm->wait(std::move(data_send_fut)); + } } // Convenience: wrap metadata + gpu_data into rapidsmpf::PackedData @@ -708,23 +706,23 @@ RunResult run_once( bool initiator = ((static_cast(rank) + op + run_index) % 2ull) == 0ull; auto rt_start = Clock::now(); - auto run_ping_pong_nc = [&](Rank peer, Tag recv_tag, Tag send_tag) { - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size); - std::vector> futs; - futs.push_back(comm->recv(peer, recv_tag, std::move(recv_buf))); - auto send_buf = alloc_and_copy_device(br, stream, 
*nocomp_payloads[i]); - ensure_ready(*send_buf); - futs.push_back(comm->send(std::move(send_buf), peer, send_tag)); - while (!futs.empty()) { - std::ignore = comm->test_some(futs); - } - } - }; - if (initiator) { - run_ping_pong_nc(dst, tag_pong_nc, tag_ping_nc); - } else { - run_ping_pong_nc(src, tag_ping_nc, tag_pong_nc); + Rank peer = initiator ? dst : src; + Tag send_tag_nc = initiator ? tag_ping_nc : tag_pong_nc; + Tag recv_tag_nc = initiator ? tag_pong_nc : tag_ping_nc; + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + std::size_t recv_size = nocomp_payloads[i]->size; + std::unique_ptr send_buf = + alloc_and_copy_device(br, stream, *nocomp_payloads[i]); + exchange_payload( + comm, + br, + stream, + peer, + send_tag_nc, + recv_tag_nc, + std::move(send_buf), + recv_size + ); } auto rt_end = Clock::now(); // Each rank measures its own RTT locally @@ -737,46 +735,33 @@ RunResult run_once( bool initiator = ((static_cast(rank) + op + run_index) % 2ull) == 0ull; auto rt_start = Clock::now(); - if (initiator) { - for (std::size_t i = 0; i < data.items.size(); ++i) { - // Send header with size to dst - std::uint64_t sz = static_cast(comp_output_sizes[i]); - send_u64_header_blocking(comm, br, stream, dst, tag_ping_c, sz); - // Receive pong header with size from src (blocking wait) - std::uint64_t pong_sz = - recv_u64_header_blocking(comm, br, stream, dst, tag_pong_c); - // Send ping payload (if any) - if (sz > 0) { - auto send_buf = alloc_device(br, stream, sz); - if (comp_output_sizes[i] > 0) - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - send_blocking(comm, std::move(send_buf), dst, tag_ping_c); - } - // Receive pong payload of announced size - if (pong_sz > 0) { - (void)recv_blocking(comm, br, stream, dst, tag_pong_c, pong_sz); - } - } - } else { - for (std::size_t i = 0; i < data.items.size(); ++i) { - // Receive ping header with size (blocking wait) - std::uint64_t ping_sz = - recv_u64_header_blocking(comm, br, stream, src, tag_ping_c); - // Send pong header with our size - std::uint64_t sz = static_cast(comp_output_sizes[i]); - send_u64_header_blocking(comm, br, stream, src, tag_pong_c, sz); - // Receive ping payload - if (ping_sz > 0) { - (void)recv_blocking(comm, br, stream, src, tag_ping_c, ping_sz); - } - // Send pong payload - if (sz > 0) { - auto send_buf = alloc_device(br, stream, sz); - if (comp_output_sizes[i] > 0) - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - send_blocking(comm, std::move(send_buf), src, tag_pong_c); - } + Rank peer = initiator ? dst : src; + Tag send_tag_c = initiator ? tag_ping_c : tag_pong_c; + Tag recv_tag_c = initiator ? 
tag_pong_c : tag_ping_c; + for (std::size_t i = 0; i < data.items.size(); ++i) { + // Header exchange: send our size, receive peer size + std::uint64_t local_sz = static_cast(comp_output_sizes[i]); + std::uint64_t remote_sz = exchange_u64_header( + comm, br, stream, peer, send_tag_c, recv_tag_c, local_sz + ); + // Prepare send buffer if needed + std::unique_ptr send_buf; + if (local_sz > 0) { + send_buf = alloc_device(br, stream, local_sz); + if (comp_output_sizes[i] > 0) + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); } + // Payload exchange using the same tags + exchange_payload( + comm, + br, + stream, + peer, + send_tag_c, + recv_tag_c, + std::move(send_buf), + static_cast(remote_sz) + ); } auto rt_end = Clock::now(); rt_c_total += (rt_end - rt_start); From fbec2a24a750a48b1c3f74c916df717bbf396fc9 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 11:40:30 -0800 Subject: [PATCH 12/26] Print compression ratio --- cpp/benchmarks/bench_comp_comm.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index b8684f25f..ce48b2dfe 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -930,6 +931,7 @@ int main(int argc, char** argv) { rt_c_t.reserve(args.num_runs); std::size_t logical_bytes = packed.total_uncompressed_bytes * args.num_ops; + std::size_t logical_compressed_bytes_last = 0; for (std::uint64_t i = 0; i < args.num_warmups + args.num_runs; ++i) { if (i == args.num_warmups + args.num_runs - 1) { @@ -941,6 +943,11 @@ int main(int argc, char** argv) { / rr.times.compress_s; double dBps = static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.decompress_s; + logical_compressed_bytes_last = rr.counts.logical_compressed_bytes; + double ratio = (rr.counts.logical_compressed_bytes > 0) + ? 
static_cast(rr.counts.logical_uncompressed_bytes) + / static_cast(rr.counts.logical_compressed_bytes) + : 0.0; // Round-trip one-way throughput: 2 * bytes_one_way / RTT double rt_nc_Bps = rr.times.rt_nocomp_s > 0.0 @@ -957,7 +964,8 @@ int main(int argc, char** argv) { ss << "compress: " << format_nbytes(cBps) << "/s | decompress: " << format_nbytes(dBps) << "/s | rt(nocomp): " << format_nbytes(rt_nc_Bps) - << "/s | rt(comp): " << format_nbytes(rt_c_Bps) << "/s"; + << "/s | rt(comp): " << format_nbytes(rt_c_Bps) << "/s" + << " | comp ratio: " << std::fixed << std::setprecision(2) << ratio << "x"; if (i < args.num_warmups) ss << " (warmup run)"; log.print(ss.str()); @@ -996,6 +1004,14 @@ int main(int argc, char** argv) { << "/s | rt(comp): " << format_nbytes((2.0 * static_cast(logical_bytes)) / mean_rt_c) << "/s"; + if (logical_compressed_bytes_last > 0) { + double mean_ratio = static_cast(logical_bytes) + / static_cast(logical_compressed_bytes_last); + ss << " | comp ratio: " << std::fixed << std::setprecision(2) << mean_ratio + << "x"; + } else { + ss << " | comp ratio: n/a"; + } log.print(ss.str()); } From a1e79923fe29f6c376011f72cf0defdfad3a1463 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 11:56:59 -0800 Subject: [PATCH 13/26] Compressed time includes compression/decompression --- cpp/benchmarks/bench_comp_comm.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index ce48b2dfe..26ffce259 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -954,10 +954,12 @@ int main(int argc, char** argv) { ? (2.0 * static_cast(rr.counts.logical_uncompressed_bytes)) / rr.times.rt_nocomp_s : 0.0; + double const rt_comp_total_s = + rr.times.compress_s + rr.times.rt_comp_s + rr.times.decompress_s; double rt_c_Bps = - rr.times.rt_comp_s > 0.0 + rt_comp_total_s > 0.0 ? (2.0 * static_cast(rr.counts.logical_uncompressed_bytes)) - / rr.times.rt_comp_s + / rt_comp_total_s : 0.0; std::stringstream ss; @@ -978,7 +980,7 @@ int main(int argc, char** argv) { static_cast(rr.counts.logical_uncompressed_bytes) / dBps ); rt_nc_t.push_back(rr.times.rt_nocomp_s); - rt_c_t.push_back(rr.times.rt_comp_s); + rt_c_t.push_back(rt_comp_total_s); } } From b0951a147bfc746f8de672e87020216f5e963bd4 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 05:52:09 -0800 Subject: [PATCH 14/26] Fix assumed file sizes --- cpp/benchmarks/bench_comp_comm.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 26ffce259..c025765e6 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -711,9 +711,15 @@ RunResult run_once( Tag send_tag_nc = initiator ? tag_ping_nc : tag_pong_nc; Tag recv_tag_nc = initiator ? 
tag_pong_nc : tag_ping_nc; for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - std::size_t recv_size = nocomp_payloads[i]->size; - std::unique_ptr send_buf = - alloc_and_copy_device(br, stream, *nocomp_payloads[i]); + // Exchange payload sizes to avoid assuming symmetric sizes across ranks + std::uint64_t local_sz = static_cast(nocomp_payloads[i]->size); + std::uint64_t remote_sz = exchange_u64_header( + comm, br, stream, peer, send_tag_nc, recv_tag_nc, local_sz + ); + std::unique_ptr send_buf; + if (local_sz > 0) { + send_buf = alloc_and_copy_device(br, stream, *nocomp_payloads[i]); + } exchange_payload( comm, br, @@ -722,7 +728,7 @@ RunResult run_once( send_tag_nc, recv_tag_nc, std::move(send_buf), - recv_size + static_cast(remote_sz) ); } auto rt_end = Clock::now(); From ef01b3830671be637101a67f43ec6fb1b63d246a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 06:45:24 -0800 Subject: [PATCH 15/26] Move nvcomp code to librapidsmpf --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/bench_comp_comm.cpp | 179 +---------------------------- cpp/include/rapidsmpf/nvcomp.h | 109 ++++++++++++++++++ cpp/src/nvcomp.cpp | 161 ++++++++++++++++++++++++++ 4 files changed, 272 insertions(+), 178 deletions(-) create mode 100644 cpp/include/rapidsmpf/nvcomp.h create mode 100644 cpp/src/nvcomp.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c473bfd63..da1e92931 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -167,6 +167,7 @@ add_library( src/cuda_event.cpp src/integrations/cudf/partition.cpp src/integrations/cudf/utils.cpp + src/nvcomp.cpp src/pausable_thread_loop.cpp src/progress_thread.cpp src/rmm_resource_adaptor.cpp diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index c025765e6..c7ff27703 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -48,10 +49,6 @@ #include "utils/misc.hpp" #include "utils/rmm_stack.hpp" -// nvCOMP managers (v3.x API) -#include -#include - using namespace rapidsmpf; namespace { @@ -60,19 +57,6 @@ enum class PackMode { Table, Column }; -enum class Algo { - Cascaded, - LZ4 -}; - -struct KvParams { - // Common - std::size_t chunk_size{1 << 20}; - // Cascaded - int cascaded_rle{1}; - int cascaded_delta{1}; - int cascaded_bitpack{1}; -}; struct Args { std::string comm_type{"mpi"}; @@ -172,167 +156,6 @@ struct PhaseThroughputs { double recv_only_Bps{0.0}; }; -class NvcompCodec { - public: - virtual ~NvcompCodec() = default; - virtual std::size_t get_max_compressed_bytes( - std::size_t uncompressed_bytes, rmm::cuda_stream_view stream - ) = 0; - virtual void compress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t* out_bytes, - rmm::cuda_stream_view stream - ) = 0; - virtual void decompress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t out_bytes, - rmm::cuda_stream_view stream - ) = 0; -}; - -class LZ4Codec final : public NvcompCodec { - public: - explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} - - std::size_t get_max_compressed_bytes( - std::size_t in_bytes, rmm::cuda_stream_view stream - ) override { - nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; - nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; - nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; - auto cfg = mgr.configure_compression(in_bytes); - return 
cfg.max_compressed_buffer_size; - } - - void compress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t* out_bytes, - rmm::cuda_stream_view stream - ) override { - nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; - nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; - nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; - auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); - } - - void decompress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t out_bytes, - rmm::cuda_stream_view stream - ) override { - (void)out_bytes; - nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; - nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; - nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; - const uint8_t* in_ptrs[1] = {static_cast(d_in)}; - size_t in_sizes[1] = {in_bytes}; - auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); - uint8_t* out_ptrs[1] = {static_cast(d_out)}; - mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); - } - - private: - std::size_t chunk_size_; -}; - -class CascadedCodec final : public NvcompCodec { - public: - CascadedCodec(std::size_t chunk_size, int rle, int delta, int bitpack) - : chunk_size_{chunk_size} { - copts_ = nvcompBatchedCascadedCompressDefaultOpts; - copts_.num_RLEs = rle ? 1 : 0; - copts_.num_deltas = delta ? 1 : 0; - copts_.use_bp = bitpack ? 
1 : 0; - dopts_ = nvcompBatchedCascadedDecompressDefaultOpts; - } - - std::size_t get_max_compressed_bytes( - std::size_t in_bytes, rmm::cuda_stream_view stream - ) override { - nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; - auto cfg = mgr.configure_compression(in_bytes); - return cfg.max_compressed_buffer_size; - } - - void compress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t* out_bytes, - rmm::cuda_stream_view stream - ) override { - nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; - auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); - } - - void decompress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t out_bytes, - rmm::cuda_stream_view stream - ) override { - (void)out_bytes; - nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; - const uint8_t* in_ptrs[1] = {static_cast(d_in)}; - size_t in_sizes[1] = {in_bytes}; - auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); - uint8_t* out_ptrs[1] = {static_cast(d_out)}; - mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); - } - - private: - std::size_t chunk_size_{}; - nvcompBatchedCascadedCompressOpts_t copts_{}; - nvcompBatchedCascadedDecompressOpts_t dopts_{}; -}; - -std::unique_ptr make_codec(Algo algo, KvParams const& p) { - switch (algo) { - case Algo::LZ4: - return std::make_unique(p.chunk_size); - case Algo::Cascaded: - default: - return std::make_unique( - p.chunk_size, p.cascaded_rle, p.cascaded_delta, p.cascaded_bitpack - ); - } -} - static inline void ensure_ready(Buffer& buf) { if (!buf.is_latest_write_done()) { buf.stream().synchronize(); diff --git a/cpp/include/rapidsmpf/nvcomp.h b/cpp/include/rapidsmpf/nvcomp.h new file mode 100644 index 000000000..055fa41a2 --- /dev/null +++ b/cpp/include/rapidsmpf/nvcomp.h @@ -0,0 +1,109 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +#include + +namespace rapidsmpf { + +enum class Algo { + Cascaded, + LZ4 +}; + +/** + * @brief Parameters for nvCOMP codec configuration + * + * Holds configuration parameters for both generic and algorithm-specific compression + * settings. + */ +struct KvParams { + /// Chunk size for compression operations (default: 1 MiB) + std::size_t chunk_size{1 << 20}; + + /// Number of run-length encoding passes in Cascaded codec (must be non-negative, + /// default: 1) + int cascaded_rle{1}; + + /// Number of delta encoding passes in Cascaded codec (must be non-negative, default: + /// 1) + int cascaded_delta{1}; + + /// Enable bitpacking in Cascaded codec (default: enabled) + bool cascaded_bitpack{true}; +}; + +/** + * @brief Abstract base class for nvCOMP codec implementations + * + * Provides a unified interface for different compression algorithms (LZ4, Cascaded, etc.) + * to perform compression and decompression operations on GPU device memory. 
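+ *
+ * Typical usage (illustrative sketch; `d_in`, `d_out`, `d_back`, `in_bytes` and
+ * `stream` are assumed to be set up by the caller):
+ * @code
+ * auto codec = make_codec(Algo::LZ4, KvParams{});
+ * // Size the output buffer for the worst case before compressing.
+ * std::size_t max_out = codec->get_max_compressed_bytes(in_bytes, stream);
+ * std::size_t out_bytes = 0;  // actual compressed size, set by compress()
+ * codec->compress(d_in, in_bytes, d_out, &out_bytes, stream);
+ * // Round-trip: the caller must know the decompressed size (in_bytes here).
+ * codec->decompress(d_out, out_bytes, d_back, in_bytes, stream);
+ * @endcode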
+ */ +class NvcompCodec { + public: + virtual ~NvcompCodec() = default; + + /** + * @brief Calculate the maximum compressed size for the given input size + * + * @param uncompressed_bytes Size of the uncompressed data in bytes + * @param stream CUDA stream for operations + * @return Maximum possible compressed size in bytes + */ + virtual std::size_t get_max_compressed_bytes( + std::size_t uncompressed_bytes, rmm::cuda_stream_view stream + ) = 0; + + /** + * @brief Compress data on the GPU + * + * @param d_in Pointer to uncompressed data on device + * @param in_bytes Size of uncompressed data in bytes + * @param d_out Pointer to output buffer on device for compressed data + * @param out_bytes Pointer to output variable that will be set to actual compressed + * size + * @param stream CUDA stream for operations + */ + virtual void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) = 0; + + /** + * @brief Decompress data on the GPU + * + * @param d_in Pointer to compressed data on device + * @param in_bytes Size of compressed data in bytes + * @param d_out Pointer to output buffer on device for decompressed data + * @param out_bytes Expected size of decompressed data in bytes + * @param stream CUDA stream for operations + */ + virtual void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) = 0; +}; + +/** + * @brief Create an nvCOMP codec instance + * + * @param algo The compression algorithm to use + * @param p Parameters for the codec + * @return A unique pointer to an NvcompCodec instance + */ +std::unique_ptr make_codec(Algo algo, KvParams const& p); + +} // namespace rapidsmpf diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp new file mode 100644 index 000000000..612399c38 --- /dev/null +++ b/cpp/src/nvcomp.cpp @@ -0,0 +1,161 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include +#include + +#include + +#include + +#include +#include + +namespace rapidsmpf { + +class LZ4Codec final : public NvcompCodec { + public: + explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} + + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + (void)out_bytes; + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); + } + + private: + std::size_t chunk_size_; +}; + +class CascadedCodec final : public NvcompCodec { + public: + CascadedCodec(std::size_t chunk_size, int rle, int delta, bool bitpack) + : chunk_size_{chunk_size} { + copts_ = nvcompBatchedCascadedCompressDefaultOpts; + copts_.num_RLEs = rle; + copts_.num_deltas = delta; + copts_.use_bp = bitpack ? 
1 : 0; + dopts_ = nvcompBatchedCascadedDecompressDefaultOpts; + } + + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + (void)out_bytes; + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); + } + + private: + std::size_t chunk_size_{}; + nvcompBatchedCascadedCompressOpts_t copts_{}; + nvcompBatchedCascadedDecompressOpts_t dopts_{}; +}; + +std::unique_ptr make_codec(Algo algo, KvParams const& p) { + switch (algo) { + case Algo::LZ4: + return std::make_unique(p.chunk_size); + case Algo::Cascaded: + default: + return std::make_unique( + p.chunk_size, p.cascaded_rle, p.cascaded_delta, p.cascaded_bitpack + ); + } +} + +} // namespace rapidsmpf From 179886d5d9031acd0299b4339c6cc76556f8003a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 09:16:20 -0800 Subject: [PATCH 16/26] Ensure packed data is immediately completed --- cpp/benchmarks/bench_comp_comm.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index c7ff27703..76b98b9d8 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -246,9 +246,11 @@ static std::unique_ptr pack_table_to_packed( cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br ) { auto packed = cudf::pack(tv, stream, br->device_mr()); - return std::make_unique( + auto ret = std::make_unique( std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) ); + ret->data->stream().synchronize(); + return ret; } struct ArgumentParser { @@ -496,7 +498,7 @@ RunResult run_once( continue; } // Ensure any prior writes to input are completed - data.items[i].packed->data->stream().synchronize(); + // data.items[i].packed->data->stream().synchronize(); // Launch compression on the output buffer's stream and record an event after comp_outputs[i]->write_access( [&codec, &data, i, in_bytes, &comp_output_sizes, stream]( From 1e235b6b952ef364b6e784e2e3c15722ace1d033 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 09:46:21 -0800 Subject: [PATCH 17/26] Multi-stream compression --- cpp/benchmarks/bench_comp_comm.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff 
--git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 76b98b9d8..20b7936a5 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -491,38 +491,42 @@ RunResult run_once( RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = Clock::now(); // Compress all items (single batch) on stream + std::vector comp_streams(data.items.size()); for (std::size_t i = 0; i < data.items.size(); ++i) { auto const in_bytes = data.items[i].packed->data->size; if (in_bytes == 0) { comp_output_sizes[i] = 0; continue; } - // Ensure any prior writes to input are completed - // data.items[i].packed->data->stream().synchronize(); // Launch compression on the output buffer's stream and record an event after comp_outputs[i]->write_access( - [&codec, &data, i, in_bytes, &comp_output_sizes, stream]( + [&codec, &data, i, in_bytes, &comp_output_sizes, &comp_streams]( std::byte* out_ptr, rmm::cuda_stream_view out_stream ) { (void)out_ptr; // pointer used below // Lock input for raw pointer access auto* in_raw = data.items[i].packed->data->exclusive_data_access(); - std::size_t out_bytes = 0; codec.compress( static_cast(in_raw), in_bytes, static_cast(out_ptr), - &out_bytes, + &comp_output_sizes[i], out_stream ); - // Ensure comp_bytes is populated before returning - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(out_stream.value())); - data.items[i].packed->data->unlock(); - comp_output_sizes[i] = out_bytes; + // Defer synchronization and unlock; record stream for later sync + comp_streams[i] = out_stream; } ); } - RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); + // Synchronize streams and unlock inputs + for (std::size_t i = 0; i < data.items.size(); ++i) { + auto const in_bytes = data.items[i].packed->data->size; + if (in_bytes == 0) { + continue; + } + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(comp_streams[i].value())); + data.items[i].packed->data->unlock(); + } auto t1 = Clock::now(); // Phase A (RTT no compression): ping-pong per op (sequential per item to avoid From 4a79b842cc4b34ee6a34befb6448e10da7a49da6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 09:49:05 -0800 Subject: [PATCH 18/26] Multi-stream decompression --- cpp/benchmarks/bench_comp_comm.cpp | 42 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 20b7936a5..4ac59066c 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -606,6 +606,7 @@ RunResult run_once( // Decompress received buffers (simulate by decompressing our own produced outputs in // symmetric setup) auto c0 = Clock::now(); + std::vector decomp_streams(data.items.size()); for (std::size_t i = 0; i < data.items.size(); ++i) { auto const out_bytes = data.items[i].packed->data->size; if (out_bytes == 0) { @@ -613,23 +614,32 @@ RunResult run_once( } auto res = br->reserve_or_fail(out_bytes, MemoryType::DEVICE); auto out = br->allocate(out_bytes, stream, res); - // Ensure compressed outputs are ready before using as input - comp_outputs[i]->stream().synchronize(); - out->write_access([&codec, &comp_outputs, &comp_output_sizes, i, out_bytes]( - std::byte* out_ptr, rmm::cuda_stream_view out_stream - ) { - auto* in_raw = comp_outputs[i]->exclusive_data_access(); - codec.decompress( - static_cast(in_raw), - comp_output_sizes[i], - static_cast(out_ptr), - out_bytes, - out_stream - ); - comp_outputs[i]->unlock(); - }); + out->write_access( + [&codec, 
&comp_outputs, &comp_output_sizes, &decomp_streams, i, out_bytes]( + std::byte* out_ptr, rmm::cuda_stream_view out_stream + ) { + auto* in_raw = comp_outputs[i]->exclusive_data_access(); + codec.decompress( + static_cast(in_raw), + comp_output_sizes[i], + static_cast(out_ptr), + out_bytes, + out_stream + ); + // Defer unlock until after per-stream synchronization + decomp_streams[i] = out_stream; + } + ); + } + // Synchronize each decomp stream and then unlock the corresponding input + for (std::size_t i = 0; i < data.items.size(); ++i) { + auto const out_bytes = data.items[i].packed->data->size; + if (out_bytes == 0) { + continue; + } + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(decomp_streams[i].value())); + comp_outputs[i]->unlock(); } - RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto c1 = Clock::now(); RunResult result{}; From c3d6c792383268839da33e58991b5a6d95570cb7 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 11:11:28 -0800 Subject: [PATCH 19/26] Add Zstd and Snappy compression --- cpp/benchmarks/bench_comp_comm.cpp | 10 ++- cpp/include/rapidsmpf/nvcomp.h | 4 +- cpp/src/nvcomp.cpp | 133 +++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 4ac59066c..8032b509c 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -301,9 +301,14 @@ struct ArgumentParser { args_.algo = Algo::Cascaded; else if (v == "lz4") args_.algo = Algo::LZ4; + else if (v == "zstd") + args_.algo = Algo::Zstd; + else if (v == "snappy") + args_.algo = Algo::Snappy; else RAPIDSMPF_FAIL( - "-A must be one of {cascaded, lz4}", std::invalid_argument + "-A must be one of {cascaded, lz4, zstd, snappy}", + std::invalid_argument ); break; } @@ -331,7 +336,8 @@ struct ArgumentParser { << " -F Parquet file glob/pattern (required)\n" << " -P Packing mode {table, column} (default: " "table)\n" - << " -A {cascaded, lz4} (default: cascaded)\n" + << " -A {cascaded, lz4, zstd, snappy} (default: " + "cascaded)\n" << " -K Algo params, e.g. 
" "chunk_size=1MiB,delta=1,rle=1,bitpack=1\n" << " -p Number of concurrent ops (default: 1)\n" diff --git a/cpp/include/rapidsmpf/nvcomp.h b/cpp/include/rapidsmpf/nvcomp.h index 055fa41a2..ef4d367fd 100644 --- a/cpp/include/rapidsmpf/nvcomp.h +++ b/cpp/include/rapidsmpf/nvcomp.h @@ -15,7 +15,9 @@ namespace rapidsmpf { enum class Algo { Cascaded, - LZ4 + LZ4, + Zstd, + Snappy }; /** diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp index 612399c38..280135235 100644 --- a/cpp/src/nvcomp.cpp +++ b/cpp/src/nvcomp.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rapidsmpf { @@ -146,10 +148,141 @@ class CascadedCodec final : public NvcompCodec { nvcompBatchedCascadedDecompressOpts_t dopts_{}; }; +class SnappyCodec final : public NvcompCodec { + public: + explicit SnappyCodec(std::size_t chunk_size) : chunk_size_{chunk_size} {} + + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcompBatchedSnappyCompressOpts_t copts = nvcompBatchedSnappyCompressDefaultOpts; + nvcompBatchedSnappyDecompressOpts_t dopts = + nvcompBatchedSnappyDecompressDefaultOpts; + nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcompBatchedSnappyCompressOpts_t copts = nvcompBatchedSnappyCompressDefaultOpts; + nvcompBatchedSnappyDecompressOpts_t dopts = + nvcompBatchedSnappyDecompressDefaultOpts; + nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + (void)out_bytes; + nvcompBatchedSnappyCompressOpts_t copts = nvcompBatchedSnappyCompressDefaultOpts; + nvcompBatchedSnappyDecompressOpts_t dopts = + nvcompBatchedSnappyDecompressDefaultOpts; + nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); + } + + private: + std::size_t chunk_size_; +}; + +class ZstdCodec final : public NvcompCodec { + public: + explicit ZstdCodec(std::size_t chunk_size) : chunk_size_{chunk_size} {} + + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcompBatchedZstdCompressOpts_t copts = nvcompBatchedZstdCompressDefaultOpts; + nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; + nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + 
) override { + nvcompBatchedZstdCompressOpts_t copts = nvcompBatchedZstdCompressDefaultOpts; + nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; + nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + (void)out_bytes; + nvcompBatchedZstdCompressOpts_t copts = nvcompBatchedZstdCompressDefaultOpts; + nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; + nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); + } + + private: + std::size_t chunk_size_; +}; + std::unique_ptr make_codec(Algo algo, KvParams const& p) { switch (algo) { case Algo::LZ4: return std::make_unique(p.chunk_size); + case Algo::Zstd: + return std::make_unique(p.chunk_size); + case Algo::Snappy: + return std::make_unique(p.chunk_size); case Algo::Cascaded: default: return std::make_unique( From a671b4724154538694eb277cce9d3ae329893fb8 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 13:56:43 -0800 Subject: [PATCH 20/26] Allow passing BufferResource to compress() --- cpp/benchmarks/bench_comp_comm.cpp | 5 +- cpp/include/rapidsmpf/nvcomp.h | 7 +- cpp/src/nvcomp.cpp | 193 ++++++++++++++++++++--------- 3 files changed, 146 insertions(+), 59 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 8032b509c..f2cccb316 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -506,7 +506,7 @@ RunResult run_once( } // Launch compression on the output buffer's stream and record an event after comp_outputs[i]->write_access( - [&codec, &data, i, in_bytes, &comp_output_sizes, &comp_streams]( + [&codec, &data, i, in_bytes, &comp_output_sizes, &comp_streams, br]( std::byte* out_ptr, rmm::cuda_stream_view out_stream ) { (void)out_ptr; // pointer used below @@ -517,7 +517,8 @@ RunResult run_once( in_bytes, static_cast(out_ptr), &comp_output_sizes[i], - out_stream + out_stream, + br ); // Defer synchronization and unlock; record stream for later sync comp_streams[i] = out_stream; diff --git a/cpp/include/rapidsmpf/nvcomp.h b/cpp/include/rapidsmpf/nvcomp.h index ef4d367fd..5b4038348 100644 --- a/cpp/include/rapidsmpf/nvcomp.h +++ b/cpp/include/rapidsmpf/nvcomp.h @@ -13,6 +13,8 @@ namespace rapidsmpf { +class BufferResource; // forward declaration + enum class Algo { Cascaded, LZ4, @@ -72,13 +74,16 @@ class NvcompCodec { * @param out_bytes Pointer to output variable that will be set to actual compressed * size * @param stream CUDA stream for operations + * @param br Optional buffer resource used for temporary allocations (e.g., to capture + * compressed size on device and copy back to host). 
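+ *
+ * A hypothetical usage sketch (assumes d_out was sized with
+ * get_max_compressed_bytes() and br is a valid BufferResource):
+ * @code
+ * std::size_t out_bytes = 0;
+ * codec->compress(d_in, in_bytes, d_out, &out_bytes, stream, &br);
+ * @endcode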
*/ virtual void compress( void const* d_in, std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br = nullptr ) = 0; /** diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp index 280135235..5091b69d0 100644 --- a/cpp/src/nvcomp.cpp +++ b/cpp/src/nvcomp.cpp @@ -12,6 +12,7 @@ #include +#include #include #include @@ -40,25 +41,45 @@ class LZ4Codec final : public NvcompCodec { std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br ) override { nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + if (br != nullptr) { + auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); + auto size_buf = br->allocate(sizeof(size_t), stream, reservation); + size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + reinterpret_cast(sz_ptr) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s + )); + }); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + } else { + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), + sizeof(size_t), + cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } } void decompress( @@ -107,23 +128,43 @@ class CascadedCodec final : public NvcompCodec { std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br ) override { nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + if (br != nullptr) { + auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); + auto size_buf = br->allocate(sizeof(size_t), stream, reservation); + size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + reinterpret_cast(sz_ptr) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s + )); + }); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + } else { + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), + 
sizeof(size_t), + cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } } void decompress( @@ -168,26 +209,46 @@ class SnappyCodec final : public NvcompCodec { std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br ) override { nvcompBatchedSnappyCompressOpts_t copts = nvcompBatchedSnappyCompressDefaultOpts; nvcompBatchedSnappyDecompressOpts_t dopts = nvcompBatchedSnappyDecompressDefaultOpts; nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + if (br != nullptr) { + auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); + auto size_buf = br->allocate(sizeof(size_t), stream, reservation); + size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + reinterpret_cast(sz_ptr) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s + )); + }); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + } else { + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), + sizeof(size_t), + cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } } void decompress( @@ -232,25 +293,45 @@ class ZstdCodec final : public NvcompCodec { std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br ) override { nvcompBatchedZstdCompressOpts_t copts = nvcompBatchedZstdCompressDefaultOpts; nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + if (br != nullptr) { + auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); + auto size_buf = br->allocate(sizeof(size_t), stream, reservation); + size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + reinterpret_cast(sz_ptr) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s + )); + }); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + } else { + size_t* pinned_bytes = nullptr; + 
RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), + sizeof(size_t), + cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } } void decompress( From 4cc54e4a8e3026b44870989b421229f4b2ad6d40 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Nov 2025 01:40:45 -0800 Subject: [PATCH 21/26] Use cudaMallocAsync when BufferResource is not specified --- cpp/src/nvcomp.cpp | 64 +++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp index 5091b69d0..e39001797 100644 --- a/cpp/src/nvcomp.cpp +++ b/cpp/src/nvcomp.cpp @@ -64,21 +64,21 @@ class LZ4Codec final : public NvcompCodec { }); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); } else { - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), - sizeof(size_t), - cudaHostAllocDefault + size_t* d_size = nullptr; + RAPIDSMPF_CUDA_TRY(cudaMallocAsync( + reinterpret_cast(&d_size), sizeof(size_t), stream.value() )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - pinned_bytes + d_size ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() + )); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); } } @@ -149,21 +149,21 @@ class CascadedCodec final : public NvcompCodec { }); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); } else { - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), - sizeof(size_t), - cudaHostAllocDefault + size_t* d_size = nullptr; + RAPIDSMPF_CUDA_TRY(cudaMallocAsync( + reinterpret_cast(&d_size), sizeof(size_t), stream.value() )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - pinned_bytes + d_size ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() + )); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); } } @@ -233,21 +233,21 @@ class SnappyCodec final : public NvcompCodec { }); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); } else { - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), - sizeof(size_t), - cudaHostAllocDefault + size_t* d_size = nullptr; + RAPIDSMPF_CUDA_TRY(cudaMallocAsync( + reinterpret_cast(&d_size), sizeof(size_t), stream.value() )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - pinned_bytes + d_size ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() + )); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); } } @@ -316,21 +316,21 @@ class ZstdCodec final : public NvcompCodec { }); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); } else { - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - 
reinterpret_cast<void**>(&pinned_bytes),
-                sizeof(size_t),
-                cudaHostAllocDefault
+            size_t* d_size = nullptr;
+            RAPIDSMPF_CUDA_TRY(cudaMallocAsync(
+                reinterpret_cast<void**>(&d_size), sizeof(size_t), stream.value()
             ));
             mgr.compress(
                 static_cast<uint8_t const*>(d_in),
                 static_cast<uint8_t*>(d_out),
                 cfg,
-                pinned_bytes
+                d_size
             );
+            RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
+                out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value()
+            ));
             RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
-            *out_bytes = *pinned_bytes;
-            RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes));
+            RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value()));
         }
     }

From 5adcea74efd48e60a6ff6818baadadad07912cfc Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 12 Nov 2025 01:44:22 -0800
Subject: [PATCH 22/26] Use a helper allocation function

---
 cpp/src/nvcomp.cpp | 173 +++++++++++++++++----------------------------
 1 file changed, 65 insertions(+), 108 deletions(-)

diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp
index e39001797..9abad0310 100644
--- a/cpp/src/nvcomp.cpp
+++ b/cpp/src/nvcomp.cpp
@@ -22,6 +22,39 @@
 namespace rapidsmpf {
 
+namespace {
+template <typename InvokeCompressFn>
+void invoke_compress_with_device_size_buffer(
+    InvokeCompressFn&& invoke_compress,
+    std::size_t* out_bytes,
+    rmm::cuda_stream_view stream,
+    BufferResource* br
+) {
+    if (br != nullptr) {
+        auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE);
+        auto size_buf = br->allocate(sizeof(size_t), stream, reservation);
+        size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) {
+            invoke_compress(reinterpret_cast<size_t*>(sz_ptr));
+            RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
+                out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s
+            ));
+        });
+        RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
+    } else {
+        size_t* d_size = nullptr;
+        RAPIDSMPF_CUDA_TRY(cudaMallocAsync(
+            reinterpret_cast<void**>(&d_size), sizeof(size_t), stream.value()
+        ));
+        invoke_compress(d_size);
+        RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
+            out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value()
+        ));
+        RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
+        RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value()));
+    }
+}
+}  // namespace
+
 class LZ4Codec final : public NvcompCodec {
   public:
     explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {}
@@ -48,38 +81,19 @@ class LZ4Codec final : public NvcompCodec {
         nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts;
         nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()};
         auto cfg = mgr.configure_compression(in_bytes);
-        if (br != nullptr) {
-            auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE);
-            auto size_buf = br->allocate(sizeof(size_t), stream, reservation);
-            size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) {
+        invoke_compress_with_device_size_buffer(
+            [&](size_t* sz_ptr) {
                 mgr.compress(
                     static_cast<uint8_t const*>(d_in),
                     static_cast<uint8_t*>(d_out),
                     cfg,
-                    reinterpret_cast<size_t*>(sz_ptr)
+                    sz_ptr
                 );
-                RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
-                    out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s
-                ));
-            });
-            RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
-        } else {
-            size_t* d_size = nullptr;
-            RAPIDSMPF_CUDA_TRY(cudaMallocAsync(
-                reinterpret_cast<void**>(&d_size), sizeof(size_t), stream.value()
-            ));
-            mgr.compress(
-                static_cast<uint8_t const*>(d_in),
-                static_cast<uint8_t*>(d_out),
-                cfg,
-                d_size
-            );
-            RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
-                out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value()
-            ));
-
RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); - } + }, + out_bytes, + stream, + br + ); } void decompress( @@ -133,38 +147,19 @@ class CascadedCodec final : public NvcompCodec { ) override { nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - if (br != nullptr) { - auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); - auto size_buf = br->allocate(sizeof(size_t), stream, reservation); - size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + invoke_compress_with_device_size_buffer( + [&](size_t* sz_ptr) { mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - reinterpret_cast(sz_ptr) + sz_ptr ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s - )); - }); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - } else { - size_t* d_size = nullptr; - RAPIDSMPF_CUDA_TRY(cudaMallocAsync( - reinterpret_cast(&d_size), sizeof(size_t), stream.value() - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - d_size - ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() - )); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); - } + }, + out_bytes, + stream, + br + ); } void decompress( @@ -217,38 +212,19 @@ class SnappyCodec final : public NvcompCodec { nvcompBatchedSnappyDecompressDefaultOpts; nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - if (br != nullptr) { - auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); - auto size_buf = br->allocate(sizeof(size_t), stream, reservation); - size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + invoke_compress_with_device_size_buffer( + [&](size_t* sz_ptr) { mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - reinterpret_cast(sz_ptr) + sz_ptr ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s - )); - }); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - } else { - size_t* d_size = nullptr; - RAPIDSMPF_CUDA_TRY(cudaMallocAsync( - reinterpret_cast(&d_size), sizeof(size_t), stream.value() - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - d_size - ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() - )); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); - } + }, + out_bytes, + stream, + br + ); } void decompress( @@ -300,38 +276,19 @@ class ZstdCodec final : public NvcompCodec { nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - if (br != nullptr) { - auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); - auto size_buf = br->allocate(sizeof(size_t), stream, reservation); - size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + invoke_compress_with_device_size_buffer( + [&](size_t* sz_ptr) { mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - reinterpret_cast(sz_ptr) + sz_ptr ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - 
out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s - )); - }); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - } else { - size_t* d_size = nullptr; - RAPIDSMPF_CUDA_TRY(cudaMallocAsync( - reinterpret_cast(&d_size), sizeof(size_t), stream.value() - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - d_size - ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() - )); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); - } + }, + out_bytes, + stream, + br + ); } void decompress( From 721dda91947dc0e6e6f1237882db4c74b353c834 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Nov 2025 02:09:25 -0800 Subject: [PATCH 23/26] Gather packing time --- cpp/benchmarks/bench_comp_comm.cpp | 75 ++++++++++++++++++------------ 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index f2cccb316..4b39b1ea4 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -430,6 +430,7 @@ struct SizeHeader { }; struct Timings { + double pack_s{0.0}; double compress_s{0.0}; double decompress_s{0.0}; // Round-trip totals measured at initiator @@ -454,7 +455,8 @@ RunResult run_once( rmm::cuda_stream_view stream, BufferResource* br, std::shared_ptr const& statistics, - BuffersToSend const& data, + cudf::table const& table, + PackMode pack_mode, NvcompCodec& codec, std::uint64_t run_index ) { @@ -469,10 +471,15 @@ RunResult run_once( Tag tag_ping_c{11, 0}; Tag tag_pong_c{11, 1}; + // Pack data per iteration + auto p0 = Clock::now(); + auto packed = make_packed_items(table, pack_mode, stream, br); + auto p1 = Clock::now(); + // Clone packed items into raw device buffers for repeated ops std::vector> nocomp_payloads; - nocomp_payloads.reserve(data.items.size()); - for (auto const& it : data.items) { + nocomp_payloads.reserve(packed.items.size()); + for (auto const& it : packed.items) { // Copy metadata + data into a contiguous device buffer for pure send path? // For pure send/recv, we only send the device payload; metadata isn't needed for // metrics. We'll send the packed->data buffer. @@ -484,10 +491,10 @@ RunResult run_once( // Pre-allocate compression outputs for each item std::vector> comp_outputs; - std::vector comp_output_sizes(data.items.size()); - comp_outputs.reserve(data.items.size()); - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const in_bytes = data.items[i].packed->data->size; + std::vector comp_output_sizes(packed.items.size()); + comp_outputs.reserve(packed.items.size()); + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const in_bytes = packed.items[i].packed->data->size; std::size_t const max_out = (in_bytes == 0) ? 
1 : codec.get_max_compressed_bytes(in_bytes, stream); auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE); @@ -497,21 +504,21 @@ RunResult run_once( RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = Clock::now(); // Compress all items (single batch) on stream - std::vector comp_streams(data.items.size()); - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const in_bytes = data.items[i].packed->data->size; + std::vector comp_streams(packed.items.size()); + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const in_bytes = packed.items[i].packed->data->size; if (in_bytes == 0) { comp_output_sizes[i] = 0; continue; } // Launch compression on the output buffer's stream and record an event after comp_outputs[i]->write_access( - [&codec, &data, i, in_bytes, &comp_output_sizes, &comp_streams, br]( + [&codec, &packed, i, in_bytes, &comp_output_sizes, &comp_streams, br]( std::byte* out_ptr, rmm::cuda_stream_view out_stream ) { (void)out_ptr; // pointer used below // Lock input for raw pointer access - auto* in_raw = data.items[i].packed->data->exclusive_data_access(); + auto* in_raw = packed.items[i].packed->data->exclusive_data_access(); codec.compress( static_cast(in_raw), in_bytes, @@ -526,13 +533,13 @@ RunResult run_once( ); } // Synchronize streams and unlock inputs - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const in_bytes = data.items[i].packed->data->size; + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const in_bytes = packed.items[i].packed->data->size; if (in_bytes == 0) { continue; } RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(comp_streams[i].value())); - data.items[i].packed->data->unlock(); + packed.items[i].packed->data->unlock(); } auto t1 = Clock::now(); @@ -581,7 +588,7 @@ RunResult run_once( Rank peer = initiator ? dst : src; Tag send_tag_c = initiator ? tag_ping_c : tag_pong_c; Tag recv_tag_c = initiator ? 
tag_pong_c : tag_ping_c; - for (std::size_t i = 0; i < data.items.size(); ++i) { + for (std::size_t i = 0; i < packed.items.size(); ++i) { // Header exchange: send our size, receive peer size std::uint64_t local_sz = static_cast(comp_output_sizes[i]); std::uint64_t remote_sz = exchange_u64_header( @@ -613,9 +620,9 @@ RunResult run_once( // Decompress received buffers (simulate by decompressing our own produced outputs in // symmetric setup) auto c0 = Clock::now(); - std::vector decomp_streams(data.items.size()); - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const out_bytes = data.items[i].packed->data->size; + std::vector decomp_streams(packed.items.size()); + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const out_bytes = packed.items[i].packed->data->size; if (out_bytes == 0) { continue; } @@ -639,8 +646,8 @@ RunResult run_once( ); } // Synchronize each decomp stream and then unlock the corresponding input - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const out_bytes = data.items[i].packed->data->size; + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const out_bytes = packed.items[i].packed->data->size; if (out_bytes == 0) { continue; } @@ -650,13 +657,14 @@ RunResult run_once( auto c1 = Clock::now(); RunResult result{}; + result.times.pack_s = std::chrono::duration(p1 - p0).count(); result.times.compress_s = std::chrono::duration(t1 - t0).count(); result.times.rt_nocomp_s = rt_nc_total.count(); result.times.rt_comp_s = rt_c_total.count(); result.times.decompress_s = std::chrono::duration(c1 - c0).count(); // Use payload (device) bytes as the logical uncompressed size for throughput - result.counts.logical_uncompressed_bytes = data.total_payload_bytes * args.num_ops; + result.counts.logical_uncompressed_bytes = packed.total_payload_bytes * args.num_ops; result.counts.logical_compressed_bytes = std::accumulate( comp_output_sizes.begin(), comp_output_sizes.end(), std::size_t{0} @@ -769,28 +777,30 @@ int main(int argc, char** argv) { auto table_with_md = cudf::io::read_parquet(reader_opts); auto& table = table_with_md.tbl; - // Pack per mode - auto packed = make_packed_items(*table, args.pack_mode, stream, &br); - // Prepare codec auto codec = make_codec(args.algo, args.params); // Runs - std::vector compress_t, decompress_t, rt_nc_t, rt_c_t; + std::vector pack_t, compress_t, decompress_t, rt_nc_t, rt_c_t; + pack_t.reserve(args.num_runs); compress_t.reserve(args.num_runs); decompress_t.reserve(args.num_runs); rt_nc_t.reserve(args.num_runs); rt_c_t.reserve(args.num_runs); - std::size_t logical_bytes = packed.total_uncompressed_bytes * args.num_ops; + std::size_t logical_bytes = 0; std::size_t logical_compressed_bytes_last = 0; for (std::uint64_t i = 0; i < args.num_warmups + args.num_runs; ++i) { if (i == args.num_warmups + args.num_runs - 1) { stats = std::make_shared(/* enable = */ true); } - auto rr = run_once(comm, args, stream, &br, stats, packed, *codec, i); + auto rr = + run_once(comm, args, stream, &br, stats, *table, args.pack_mode, *codec, i); + logical_bytes = rr.counts.logical_uncompressed_bytes; + double pBps = + static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.pack_s; double cBps = static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.compress_s; double dBps = static_cast(rr.counts.logical_uncompressed_bytes) @@ -815,7 +825,7 @@ int main(int argc, char** argv) { : 0.0; std::stringstream ss; - ss << "compress: " << format_nbytes(cBps) + ss << "pack: " << format_nbytes(pBps) << "/s | compress: " 
<< format_nbytes(cBps) << "/s | decompress: " << format_nbytes(dBps) << "/s | rt(nocomp): " << format_nbytes(rt_nc_Bps) << "/s | rt(comp): " << format_nbytes(rt_c_Bps) << "/s" @@ -825,6 +835,9 @@ int main(int argc, char** argv) { log.print(ss.str()); if (i >= args.num_warmups) { + pack_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / pBps + ); compress_t.push_back( static_cast(rr.counts.logical_uncompressed_bytes) / cBps ); @@ -845,13 +858,15 @@ int main(int argc, char** argv) { }; if (!compress_t.empty()) { + double mean_elapsed_p = harmonic_mean(pack_t); double mean_elapsed_c = harmonic_mean(compress_t); double mean_elapsed_d = harmonic_mean(decompress_t); double mean_rt_nc = harmonic_mean(rt_nc_t); double mean_rt_c = harmonic_mean(rt_c_t); std::stringstream ss; - ss << "means: compress: " << format_nbytes(logical_bytes / mean_elapsed_c) << "/s" + ss << "means: pack: " << format_nbytes(logical_bytes / mean_elapsed_p) << "/s" + << " | compress: " << format_nbytes(logical_bytes / mean_elapsed_c) << "/s" << " | decompress: " << format_nbytes(logical_bytes / mean_elapsed_d) << "/s" << " | rt(nocomp): " << format_nbytes((2.0 * static_cast(logical_bytes)) / mean_rt_nc) From 73a17dd5869eb198e8f3573513564b09bc7eae69 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Nov 2025 02:52:25 -0800 Subject: [PATCH 24/26] Make individual column packing non-blocking --- cpp/benchmarks/bench_comp_comm.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 4b39b1ea4..d40425c6e 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -249,7 +249,6 @@ static std::unique_ptr pack_table_to_packed( auto ret = std::make_unique( std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) ); - ret->data->stream().synchronize(); return ret; } @@ -407,6 +406,8 @@ BuffersToSend make_packed_items( if (mode == PackMode::Table) { auto item = PackedItem{}; item.packed = pack_table_to_packed(table.view(), stream, br); + // Ensure the pack is completed before using the buffer downstream + ensure_ready(*item.packed->data); ret.total_uncompressed_bytes += item.packed->data->size; ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); @@ -420,6 +421,10 @@ BuffersToSend make_packed_items( ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); } + // Synchronize all packs after launching them to avoid per-iteration blocking + for (std::size_t i = 0; i < ret.items.size(); ++i) { + ensure_ready(*ret.items[i].packed->data); + } } return ret; } From bbb555ab1e966f1093601217b5c51b216ab9d606 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Nov 2025 03:21:31 -0800 Subject: [PATCH 25/26] Reorganization and cleanup --- cpp/benchmarks/bench_comp_comm.cpp | 424 +++++++++++++---------------- 1 file changed, 196 insertions(+), 228 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index d40425c6e..64e5dcb40 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -39,7 +39,6 @@ #include #include #include -#include #include #ifdef RAPIDSMPF_HAVE_CUPTI @@ -72,186 +71,6 @@ struct Args { std::string cupti_csv_prefix; }; -std::vector expand_glob(std::string const& pattern) { - std::vector files; - glob_t glob_result{}; - int rc = glob(pattern.c_str(), GLOB_TILDE, nullptr, &glob_result); - if (rc 
== 0) { - for (std::size_t i = 0; i < glob_result.gl_pathc; ++i) { - files.emplace_back(glob_result.gl_pathv[i]); - } - } - globfree(&glob_result); - return files; -} - -std::size_t parse_nbytes(std::string const& s) { - // Simple parser: supports suffixes KiB, MiB, GiB, KB, MB, GB, or no suffix. - auto to_lower = [](char c) { return static_cast(std::tolower(c)); }; - std::string v; - v.reserve(s.size()); - for (char c : s) - v.push_back(to_lower(c)); - - std::size_t mult = 1; - if (v.ends_with("kib")) { - mult = 1ull << 10; - v = v.substr(0, v.size() - 3); - } else if (v.ends_with("mib")) { - mult = 1ull << 20; - v = v.substr(0, v.size() - 3); - } else if (v.ends_with("gib")) { - mult = 1ull << 30; - v = v.substr(0, v.size() - 3); - } else if (v.ends_with("kb")) { - mult = 1000ull; - v = v.substr(0, v.size() - 2); - } else if (v.ends_with("mb")) { - mult = 1000ull * 1000ull; - v = v.substr(0, v.size() - 2); - } else if (v.ends_with("gb")) { - mult = 1000ull * 1000ull * 1000ull; - v = v.substr(0, v.size() - 2); - } - - return static_cast(std::stoll(v)) * mult; -} - -KvParams parse_kv_params(std::string const& kv) { - KvParams p{}; - if (kv.empty()) - return p; - std::size_t start = 0; - while (start < kv.size()) { - auto comma = kv.find(',', start); - auto part = kv.substr( - start, comma == std::string::npos ? std::string::npos : comma - start - ); - auto eq = part.find('='); - if (eq != std::string::npos) { - std::string key = part.substr(0, eq); - std::string val = part.substr(eq + 1); - if (key == "chunk_size") - p.chunk_size = parse_nbytes(val); - else if (key == "delta") - p.cascaded_delta = std::stoi(val); - else if (key == "rle") - p.cascaded_rle = std::stoi(val); - else if (key == "bitpack") - p.cascaded_bitpack = std::stoi(val); - } - if (comma == std::string::npos) - break; - start = comma + 1; - } - return p; -} - -struct PhaseThroughputs { - double compress_Bps{0.0}; - double decompress_Bps{0.0}; - double comp_send_Bps{0.0}; - double recv_decomp_Bps{0.0}; - double send_only_Bps{0.0}; - double recv_only_Bps{0.0}; -}; - -static inline void ensure_ready(Buffer& buf) { - if (!buf.is_latest_write_done()) { - buf.stream().synchronize(); - } -} - -static inline std::unique_ptr alloc_device( - BufferResource* br, rmm::cuda_stream_view stream, std::size_t size -) { - auto res = br->reserve_or_fail(size, MemoryType::DEVICE); - return br->allocate(size, stream, res); -} - -static inline std::unique_ptr alloc_and_copy_device( - BufferResource* br, rmm::cuda_stream_view stream, Buffer const& src -) { - auto out = alloc_device(br, stream, src.size); - buffer_copy(*out, src, src.size); - return out; -} - -// Non-blocking helpers to exchange headers and payloads concurrently. 
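The helpers being moved here implement a symmetric exchange: both ranks first swap payload sizes with exchange_u64_header() (the header send and recv are both posted before either is waited on), and exchange_payload() then posts its receive before its send, so two peers exchanging simultaneously cannot deadlock. A hypothetical per-item round using these same helpers, mirroring their use in run_once():

    // Swap sizes first, then move the payloads; peer/tags/send_buf as in run_once().
    std::uint64_t remote_sz = exchange_u64_header(
        comm, br, stream, peer, send_tag, recv_tag, local_sz
    );
    exchange_payload(
        comm, br, stream, peer, send_tag, recv_tag, std::move(send_buf), remote_sz
    );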
-static inline std::uint64_t exchange_u64_header( - std::shared_ptr const& comm, - BufferResource* br, - rmm::cuda_stream_view stream, - Rank peer, - Tag send_tag, - Tag recv_tag, - std::uint64_t send_value -) { - // Post header send - auto send_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto send_hdr = br->allocate(sizeof(std::uint64_t), stream, send_hdr_res); - send_hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { - std::memcpy(p, &send_value, sizeof(std::uint64_t)); - }); - ensure_ready(*send_hdr); - auto send_hdr_fut = comm->send(std::move(send_hdr), peer, send_tag); - // Post header recv - auto recv_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto recv_hdr = br->allocate(sizeof(std::uint64_t), stream, recv_hdr_res); - ensure_ready(*recv_hdr); - auto recv_hdr_fut = comm->recv(peer, recv_tag, std::move(recv_hdr)); - // Wait recv, read value, then ensure send completion - auto recv_hdr_buf = comm->wait(std::move(recv_hdr_fut)); - std::uint64_t recv_value = 0; - { - auto* p = recv_hdr_buf->exclusive_data_access(); - std::memcpy(&recv_value, p, sizeof(std::uint64_t)); - recv_hdr_buf->unlock(); - } - std::ignore = comm->wait(std::move(send_hdr_fut)); - return recv_value; -} - -static inline void exchange_payload( - std::shared_ptr const& comm, - BufferResource* br, - rmm::cuda_stream_view stream, - Rank peer, - Tag send_tag, - Tag recv_tag, - std::unique_ptr send_buf, // may be null if no data to send - std::size_t recv_size // may be zero if no data to recv -) { - std::unique_ptr data_send_fut; - std::unique_ptr data_recv_fut; - if (recv_size > 0) { - auto recv_buf = alloc_device(br, stream, recv_size); - ensure_ready(*recv_buf); - data_recv_fut = comm->recv(peer, recv_tag, std::move(recv_buf)); - } - if (send_buf && send_buf->size > 0) { - ensure_ready(*send_buf); - data_send_fut = comm->send(std::move(send_buf), peer, send_tag); - } - if (data_recv_fut) { - (void)comm->wait(std::move(data_recv_fut)); - } - if (data_send_fut) { - std::ignore = comm->wait(std::move(data_send_fut)); - } -} - -// Convenience: wrap metadata + gpu_data into rapidsmpf::PackedData -static std::unique_ptr pack_table_to_packed( - cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br -) { - auto packed = cudf::pack(tv, stream, br->device_mr()); - auto ret = std::make_unique( - std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) - ); - return ret; -} - struct ArgumentParser { ArgumentParser(int argc, char* const* argv, bool use_mpi) { int rank = 0; @@ -378,6 +197,68 @@ struct ArgumentParser { return args_; } + static std::size_t parse_nbytes(std::string const& s) { + // Simple parser: supports suffixes KiB, MiB, GiB, KB, MB, GB, or no suffix. 
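+        // Examples, derived from the rules below:
+        //   "1MiB" -> 1048576, "2kb" -> 2000, "512" -> 512 (no suffix).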
+ auto to_lower = [](char c) { return static_cast(std::tolower(c)); }; + std::string v; + v.reserve(s.size()); + for (char c : s) + v.push_back(to_lower(c)); + + std::size_t mult = 1; + if (v.ends_with("kib")) { + mult = 1ull << 10; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("mib")) { + mult = 1ull << 20; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("gib")) { + mult = 1ull << 30; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("kb")) { + mult = 1000ull; + v = v.substr(0, v.size() - 2); + } else if (v.ends_with("mb")) { + mult = 1000ull * 1000ull; + v = v.substr(0, v.size() - 2); + } else if (v.ends_with("gb")) { + mult = 1000ull * 1000ull * 1000ull; + v = v.substr(0, v.size() - 2); + } + + return static_cast(std::stoll(v)) * mult; + } + + static KvParams parse_kv_params(std::string const& kv) { + KvParams p{}; + if (kv.empty()) + return p; + std::size_t start = 0; + while (start < kv.size()) { + auto comma = kv.find(',', start); + auto part = kv.substr( + start, comma == std::string::npos ? std::string::npos : comma - start + ); + auto eq = part.find('='); + if (eq != std::string::npos) { + std::string key = part.substr(0, eq); + std::string val = part.substr(eq + 1); + if (key == "chunk_size") + p.chunk_size = parse_nbytes(val); + else if (key == "delta") + p.cascaded_delta = std::stoi(val); + else if (key == "rle") + p.cascaded_rle = std::stoi(val); + else if (key == "bitpack") + p.cascaded_bitpack = std::stoi(val); + } + if (comma == std::string::npos) + break; + start = comma + 1; + } + return p; + } + private: Args args_{}; }; @@ -396,6 +277,54 @@ struct BuffersToSend { std::size_t total_payload_bytes{0}; }; +struct Timings { + double pack_s{0.0}; + double compress_s{0.0}; + double decompress_s{0.0}; + // Round-trip totals measured at initiator + double rt_nocomp_s{0.0}; + double rt_comp_s{0.0}; +}; + +struct Counters { + std::size_t logical_uncompressed_bytes{0}; + std::size_t logical_compressed_bytes{0}; +}; + +struct RunResult { + Timings times; + Counters counts; +}; + +std::vector expand_glob(std::string const& pattern) { + std::vector files; + glob_t glob_result{}; + int rc = glob(pattern.c_str(), GLOB_TILDE, nullptr, &glob_result); + if (rc == 0) { + for (std::size_t i = 0; i < glob_result.gl_pathc; ++i) { + files.emplace_back(glob_result.gl_pathv[i]); + } + } + globfree(&glob_result); + return files; +} + +static inline void ensure_ready(Buffer& buf) { + if (!buf.is_latest_write_done()) { + buf.stream().synchronize(); + } +} + +static std::unique_ptr pack_table_to_packed( + cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br +) { + auto packed = cudf::pack(tv, stream, br->device_mr()); + auto ret = std::make_unique( + std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) + ); + return ret; +} + BuffersToSend make_packed_items( cudf::table const& table, PackMode mode, @@ -406,7 +335,6 @@ BuffersToSend make_packed_items( if (mode == PackMode::Table) { auto item = PackedItem{}; item.packed = pack_table_to_packed(table.view(), stream, br); - // Ensure the pack is completed before using the buffer downstream ensure_ready(*item.packed->data); ret.total_uncompressed_bytes += item.packed->data->size; ret.total_payload_bytes += item.packed->data->size; @@ -421,7 +349,6 @@ BuffersToSend make_packed_items( ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); } - // Synchronize all packs after launching them to avoid per-iteration blocking for (std::size_t i = 0; i < 
ret.items.size(); ++i) { ensure_ready(*ret.items[i].packed->data); } @@ -429,43 +356,94 @@ BuffersToSend make_packed_items( return ret; } -// Send/recv helpers: send a header (compressed size) as host buffer. -struct SizeHeader { - std::uint64_t bytes; -}; +static inline std::unique_ptr alloc_device( + BufferResource* br, rmm::cuda_stream_view stream, std::size_t size +) { + auto res = br->reserve_or_fail(size, MemoryType::DEVICE); + return br->allocate(size, stream, res); +} -struct Timings { - double pack_s{0.0}; - double compress_s{0.0}; - double decompress_s{0.0}; - // Round-trip totals measured at initiator - double rt_nocomp_s{0.0}; - double rt_comp_s{0.0}; -}; +static inline std::unique_ptr alloc_and_copy_device( + BufferResource* br, rmm::cuda_stream_view stream, Buffer const& src +) { + auto out = alloc_device(br, stream, src.size); + buffer_copy(*out, src, src.size); + return out; +} -// Returns timings and bytes counters -struct Counters { - std::size_t logical_uncompressed_bytes{0}; - std::size_t logical_compressed_bytes{0}; -}; +static inline std::uint64_t exchange_u64_header( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank peer, + Tag send_tag, + Tag recv_tag, + std::uint64_t send_value +) { + // Post header send + auto send_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto send_hdr = br->allocate(sizeof(std::uint64_t), stream, send_hdr_res); + send_hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { + std::memcpy(p, &send_value, sizeof(std::uint64_t)); + }); + ensure_ready(*send_hdr); + auto send_hdr_fut = comm->send(std::move(send_hdr), peer, send_tag); + // Post header recv + auto recv_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto recv_hdr = br->allocate(sizeof(std::uint64_t), stream, recv_hdr_res); + ensure_ready(*recv_hdr); + auto recv_hdr_fut = comm->recv(peer, recv_tag, std::move(recv_hdr)); + // Wait recv, read value, then ensure send completion + auto recv_hdr_buf = comm->wait(std::move(recv_hdr_fut)); + std::uint64_t recv_value = 0; + { + auto* p = recv_hdr_buf->exclusive_data_access(); + std::memcpy(&recv_value, p, sizeof(std::uint64_t)); + recv_hdr_buf->unlock(); + } + std::ignore = comm->wait(std::move(send_hdr_fut)); + return recv_value; +} -struct RunResult { - Timings times; - Counters counts; -}; +static inline void exchange_payload( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank peer, + Tag send_tag, + Tag recv_tag, + std::unique_ptr send_buf, + std::size_t recv_size +) { + std::unique_ptr data_send_fut; + std::unique_ptr data_recv_fut; + if (recv_size > 0) { + auto recv_buf = alloc_device(br, stream, recv_size); + ensure_ready(*recv_buf); + data_recv_fut = comm->recv(peer, recv_tag, std::move(recv_buf)); + } + if (send_buf && send_buf->size > 0) { + ensure_ready(*send_buf); + data_send_fut = comm->send(std::move(send_buf), peer, send_tag); + } + if (data_recv_fut) { + std::ignore = comm->wait(std::move(data_recv_fut)); + } + if (data_send_fut) { + std::ignore = comm->wait(std::move(data_send_fut)); + } +} RunResult run_once( std::shared_ptr const& comm, Args const& args, rmm::cuda_stream_view stream, BufferResource* br, - std::shared_ptr const& statistics, cudf::table const& table, PackMode pack_mode, NvcompCodec& codec, std::uint64_t run_index ) { - (void)statistics; auto const nranks = comm->nranks(); auto const rank = comm->rank(); auto const dst = static_cast((rank + 1) % nranks); @@ -521,7 
+499,6 @@ RunResult run_once( [&codec, &packed, i, in_bytes, &comp_output_sizes, &comp_streams, br]( std::byte* out_ptr, rmm::cuda_stream_view out_stream ) { - (void)out_ptr; // pointer used below // Lock input for raw pointer access auto* in_raw = packed.items[i].packed->data->exclusive_data_access(); codec.compress( @@ -532,7 +509,6 @@ RunResult run_once( out_stream, br ); - // Defer synchronization and unlock; record stream for later sync comp_streams[i] = out_stream; } ); @@ -548,8 +524,7 @@ RunResult run_once( } auto t1 = Clock::now(); - // Phase A (RTT no compression): ping-pong per op (sequential per item to avoid - // deadlocks) + // Phase A (RTT no compression): ping-pong per op (sequential per item) Duration rt_nc_total{0}; for (std::uint64_t op = 0; op < args.num_ops; ++op) { bool initiator = @@ -559,7 +534,7 @@ RunResult run_once( Tag send_tag_nc = initiator ? tag_ping_nc : tag_pong_nc; Tag recv_tag_nc = initiator ? tag_pong_nc : tag_ping_nc; for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - // Exchange payload sizes to avoid assuming symmetric sizes across ranks + // Exchange payload sizes std::uint64_t local_sz = static_cast(nocomp_payloads[i]->size); std::uint64_t remote_sz = exchange_u64_header( comm, br, stream, peer, send_tag_nc, recv_tag_nc, local_sz @@ -622,8 +597,7 @@ RunResult run_once( rt_c_total += (rt_end - rt_start); } - // Decompress received buffers (simulate by decompressing our own produced outputs in - // symmetric setup) + // Decompress received buffers (simulate by decompressing our own produced outputs) auto c0 = Clock::now(); std::vector decomp_streams(packed.items.size()); for (std::size_t i = 0; i < packed.items.size(); ++i) { @@ -645,7 +619,6 @@ RunResult run_once( out_bytes, out_stream ); - // Defer unlock until after per-stream synchronization decomp_streams[i] = out_stream; } ); @@ -668,7 +641,7 @@ RunResult run_once( result.times.rt_comp_s = rt_c_total.count(); result.times.decompress_s = std::chrono::duration(c1 - c0).count(); - // Use payload (device) bytes as the logical uncompressed size for throughput + // Use payload bytes as the logical uncompressed size for throughput result.counts.logical_uncompressed_bytes = packed.total_payload_bytes * args.num_ops; result.counts.logical_compressed_bytes = std::accumulate( @@ -682,10 +655,13 @@ RunResult run_once( int main(int argc, char** argv) { // Check if we should use bootstrap mode with rrun + // This is determined by checking for RAPIDSMPF_RANK environment variable bool use_bootstrap = std::getenv("RAPIDSMPF_RANK") != nullptr; int provided = 0; if (!use_bootstrap) { + // Explicitly initialize MPI with thread support, as this is needed for both mpi + // and ucxx communicators. 
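+        // The RAPIDSMPF_EXPECTS check below fails fast if the MPI
+        // implementation cannot actually provide MPI_THREAD_MULTIPLE.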
@@ -682,10 +655,13 @@ int main(int argc, char** argv) {
     // Check if we should use bootstrap mode with rrun
+    // This is determined by checking for the RAPIDSMPF_RANK environment variable
     bool use_bootstrap = std::getenv("RAPIDSMPF_RANK") != nullptr;
 
     int provided = 0;
     if (!use_bootstrap) {
+        // Explicitly initialize MPI with thread support, as this is needed for both
+        // the mpi and ucxx communicators.
         RAPIDSMPF_MPI(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided));
         RAPIDSMPF_EXPECTS(
             provided == MPI_THREAD_MULTIPLE,
@@ -711,10 +687,12 @@ int main(int argc, char** argv) {
         comm = std::make_shared<MPI>(MPI_COMM_WORLD, options);
     } else if (args.comm_type == "ucxx") {
         if (use_bootstrap) {
+            // Launched with rrun - use the bootstrap backend
             comm = rapidsmpf::bootstrap::create_ucxx_comm(
                 rapidsmpf::bootstrap::Backend::AUTO, options
             );
         } else {
+            // Launched with mpirun - use the MPI bootstrap
             comm = rapidsmpf::ucxx::init_using_mpi(MPI_COMM_WORLD, options);
         }
     } else {
@@ -725,12 +703,11 @@ int main(int argc, char** argv) {
     auto& log = comm->logger();
     rmm::cuda_stream_view stream = cudf::get_default_stream();
 
-    // RMM setup
     auto const mr_stack = set_current_rmm_stack(args.rmm_mr);
     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
     BufferResource br{mr};
 
-    // Hardware info
+    // Print benchmark/hardware info.
     {
         std::stringstream ss;
         auto const cur_dev = rmm::get_current_cuda_device().value();
@@ -749,9 +726,8 @@ int main(int argc, char** argv) {
         log.print(ss.str());
     }
 
-    // Stats and CUPTI
-    auto stats = std::make_shared<Statistics>(/* enable = */ false);
 #ifdef RAPIDSMPF_HAVE_CUPTI
+    // Create CUPTI monitor if enabled
     std::unique_ptr<CuptiMonitor> cupti_monitor;
     if (args.enable_cupti_monitoring) {
         cupti_monitor = std::make_unique<CuptiMonitor>();
@@ -776,16 +752,13 @@ int main(int argc, char** argv) {
     );
     log.print("Rank " + std::to_string(comm->rank()) + " reading: " + my_file);
 
-    // Read Parquet into cudf::table
     cudf::io::parquet_reader_options reader_opts =
         cudf::io::parquet_reader_options::builder(cudf::io::source_info{my_file});
     auto table_with_md = cudf::io::read_parquet(reader_opts);
     auto& table = table_with_md.tbl;
 
-    // Prepare codec
     auto codec = make_codec(args.algo, args.params);
 
-    // Runs
     std::vector<double> pack_t, compress_t, decompress_t, rt_nc_t, rt_c_t;
     pack_t.reserve(args.num_runs);
     compress_t.reserve(args.num_runs);
@@ -797,11 +770,7 @@ int main(int argc, char** argv) {
     std::size_t logical_compressed_bytes_last = 0;
 
     for (std::uint64_t i = 0; i < args.num_warmups + args.num_runs; ++i) {
-        if (i == args.num_warmups + args.num_runs - 1) {
-            stats = std::make_shared<Statistics>(/* enable = */ true);
-        }
-        auto rr =
-            run_once(comm, args, stream, &br, stats, *table, args.pack_mode, *codec, i);
+        auto rr = run_once(comm, args, stream, &br, *table, args.pack_mode, *codec, i);
         logical_bytes = rr.counts.logical_uncompressed_bytes;
 
         double pBps =
@@ -854,7 +823,6 @@ int main(int argc, char** argv) {
         }
     }
 
-    // Means
     auto harmonic_mean = [](std::vector<double> const& v) {
         double denom_sum = 0.0;
         for (auto x : v)

From c46305ff5e193b7b4c60faf16b4eb03aac39d987 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 12 Nov 2025 12:16:37 -0800
Subject: [PATCH 26/26] Support (de-)compressing and transferring data only

---
 cpp/benchmarks/bench_comp_comm.cpp | 106 +++++++++++++++++++++++++----
 1 file changed, 93 insertions(+), 13 deletions(-)

diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp
index 64e5dcb40..7fd1af243 100644
--- a/cpp/benchmarks/bench_comp_comm.cpp
+++ b/cpp/benchmarks/bench_comp_comm.cpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -69,6 +70,7 @@ struct Args {
     std::uint64_t num_ops{1};
     bool enable_cupti_monitoring{false};
     std::string cupti_csv_prefix;
+    bool data_only{false};
 };
 
 struct ArgumentParser {
@@ -81,8 +83,8 @@ struct ArgumentParser {
         try {
             int opt;
             // C: comm, r: runs, w: warmups, m: rmm, F: files, P: pack mode, A: algo, K:
-            // kv, p: ops, M: cupti, h: help
-            while ((opt = getopt(argc, argv, "C:r:w:m:F:P:A:K:p:M:h")) != -1) {
+            // kv, p: ops, M: cupti, D: data-only, h: help
+            while ((opt = getopt(argc, argv, "C:r:w:m:F:P:A:K:p:M:Dh")) != -1) {
                 switch (opt) {
                 case 'C':
                     args_.comm_type = std::string{optarg};
@@ -140,6 +142,9 @@ struct ArgumentParser {
                     args_.enable_cupti_monitoring = true;
                     args_.cupti_csv_prefix = std::string{optarg};
                     break;
+                case 'D':
+                    args_.data_only = true;
+                    break;
                 case 'h':
                 default:
                 {
@@ -160,6 +165,8 @@ struct ArgumentParser {
                           "chunk_size=1MiB,delta=1,rle=1,bitpack=1\n"
                        << "  -p  Number of concurrent ops (default: 1)\n"
                        << "  -M  CUPTI CSV path prefix (enable CUPTI)\n"
+                       << "  -D  Data-only mode (compress/transfer data "
+                          "buffers only)\n"
                        << "  -h  Show this help\n";
                     if (rank == 0)
                         std::cerr << ss.str();
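For context, a usage sketch of the new flag (illustrative only: the launcher, rank count, file pattern, and the -A value are assumptions, since algorithm-name parsing is outside this excerpt):

    mpirun -np 2 bench_comp_comm -C mpi -F '/data/part-*.parquet' -A lz4 -r 3 -w 1 -D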
@@ -266,6 +273,8 @@ struct ArgumentParser {
 struct PackedItem {
     // Ownership: we store size and buffer pointer for the packed payload
     std::unique_ptr<PackedData> packed;  // original packed cudf table/column
+    // Data-only mode: directly owned GPU buffer containing column/table data
+    std::unique_ptr<Buffer> raw_data;
 };
 
 struct BuffersToSend {
@@ -277,6 +286,14 @@ struct BuffersToSend {
     std::size_t total_payload_bytes{0};
 };
 
+static inline Buffer& item_data_buffer(PackedItem& it) {
+    return it.packed ? *it.packed->data : *it.raw_data;
+}
+
+static inline Buffer const& item_data_buffer(PackedItem const& it) {
+    return it.packed ? *it.packed->data : *it.raw_data;
+}
+
 struct Timings {
     double pack_s{0.0};
     double compress_s{0.0};
@@ -356,6 +373,67 @@ BuffersToSend make_packed_items(
     return ret;
 }
 
+// Collect leaf data buffers recursively (device memory ranges) from a column.
+static void collect_leaf_data_buffers(
+    cudf::column_view const& col,
+    std::vector<std::pair<std::byte const*, std::size_t>>& buffers
+) {
+    for (auto it = col.child_begin(); it != col.child_end(); ++it) {
+        collect_leaf_data_buffers(*it, buffers);
+    }
+    if (col.num_children() == 0 && col.size() > 0) {
+        auto const elem_size = static_cast<std::size_t>(cudf::size_of(col.type()));
+        if (elem_size == 0) {
+            return;
+        }
+        auto const* base = col.head<std::byte>();
+        if (base == nullptr) {
+            return;
+        }
+        auto const byte_offset = static_cast<std::size_t>(col.offset()) * elem_size;
+        auto const num_bytes = static_cast<std::size_t>(col.size()) * elem_size;
+        buffers.emplace_back(static_cast<std::byte const*>(base + byte_offset), num_bytes);
+    }
+}
+
+static std::vector<std::pair<std::byte const*, std::size_t>> collect_table_data_buffers(
+    cudf::table_view const& tv
+) {
+    std::vector<std::pair<std::byte const*, std::size_t>> out;
+    for (auto const& col : tv) {
+        collect_leaf_data_buffers(col, out);
+    }
+    return out;
+}
+
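A minimal usage sketch for the collectors above (illustrative helper, not part of the patch). Only leaf data ranges are gathered, so null masks and zero-width element types contribute nothing; this can serve as a sanity check against the totals accumulated in the data-only path below:

    // Illustrative only: total leaf data bytes for a table view.
    static std::size_t total_leaf_bytes(cudf::table_view const& tv) {
        std::size_t total = 0;
        for (auto const& leaf : collect_table_data_buffers(tv)) {
            total += leaf.second;  // byte count of one leaf data range
        }
        return total;
    }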
+// Data-only path: build items consisting solely of copies of each leaf device buffer.
+BuffersToSend make_data_only_items(
+    cudf::table const& table, rmm::cuda_stream_view stream, BufferResource* br
+) {
+    BuffersToSend ret{};
+    auto leaves = collect_table_data_buffers(table.view());
+    ret.items.reserve(leaves.size());
+    for (auto const& [src_ptr, nbytes] : leaves) {
+        if (nbytes == 0) {
+            continue;
+        }
+        auto reservation = br->reserve_or_fail(nbytes, MemoryType::DEVICE);
+        auto buf = br->allocate(nbytes, stream, reservation);
+        buf->write_access([&](std::byte* dst, rmm::cuda_stream_view s) {
+            RAPIDSMPF_CUDA_TRY_ALLOC(
+                cudaMemcpyAsync(dst, src_ptr, nbytes, cudaMemcpyDefault, s)
+            );
+        });
+        ensure_ready(*buf);
+        PackedItem item{};
+        item.raw_data = std::move(buf);
+        ret.total_uncompressed_bytes += nbytes;
+        ret.total_payload_bytes += nbytes;
+        ret.items.emplace_back(std::move(item));
+    }
+    return ret;
+}
+
 static inline std::unique_ptr<Buffer> alloc_device(
     BufferResource* br, rmm::cuda_stream_view stream, std::size_t size
 ) {
     auto res = br->reserve_or_fail(size, MemoryType::DEVICE);
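The item_data_buffer() accessor introduced above is what lets the rest of run_once stay agnostic to which builder produced an item. A sketch (illustrative helper, not part of the patch) of the pattern:

    // Illustrative only: item_data_buffer() resolves to packed->data for
    // packed items and to raw_data for data-only items.
    static std::size_t total_item_bytes(BuffersToSend const& b) {
        std::size_t total = 0;
        for (auto const& it : b.items) {
            total += item_data_buffer(it).size;
        }
        return total;
    }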
@@ -456,7 +534,8 @@ RunResult run_once(
 
     // Pack data per iteration
     auto p0 = Clock::now();
-    auto packed = make_packed_items(table, pack_mode, stream, br);
+    auto packed = args.data_only ? make_data_only_items(table, stream, br)
+                                 : make_packed_items(table, pack_mode, stream, br);
     auto p1 = Clock::now();
 
     // Clone packed items into raw device buffers for repeated ops
@@ -466,9 +545,10 @@ RunResult run_once(
         // Copy metadata + data into a contiguous device buffer for pure send path?
         // For pure send/recv, we only send the device payload; metadata isn't needed for
         // metrics. We'll send the packed->data buffer.
-        auto reservation = br->reserve_or_fail(it.packed->data->size, MemoryType::DEVICE);
-        auto buf = br->allocate(it.packed->data->size, stream, reservation);
-        buffer_copy(*buf, *it.packed->data, it.packed->data->size);
+        auto const& src_buf = item_data_buffer(it);
+        auto reservation = br->reserve_or_fail(src_buf.size, MemoryType::DEVICE);
+        auto buf = br->allocate(src_buf.size, stream, reservation);
+        buffer_copy(*buf, src_buf, src_buf.size);
         nocomp_payloads.emplace_back(std::move(buf));
     }
 
@@ -477,7 +557,7 @@ RunResult run_once(
     std::vector<std::size_t> comp_output_sizes(packed.items.size());
     comp_outputs.reserve(packed.items.size());
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const in_bytes = packed.items[i].packed->data->size;
+        auto const in_bytes = item_data_buffer(packed.items[i]).size;
         std::size_t const max_out =
             (in_bytes == 0) ? 1 : codec.get_max_compressed_bytes(in_bytes, stream);
         auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE);
@@ -489,7 +569,7 @@ RunResult run_once(
     // Compress all items (single batch) on stream
     std::vector<rmm::cuda_stream_view> comp_streams(packed.items.size());
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const in_bytes = packed.items[i].packed->data->size;
+        auto const in_bytes = item_data_buffer(packed.items[i]).size;
         if (in_bytes == 0) {
             comp_output_sizes[i] = 0;
             continue;
         }
@@ -500,7 +580,7 @@ RunResult run_once(
                     std::byte* out_ptr, rmm::cuda_stream_view out_stream
                 ) {
                     // Lock input for raw pointer access
-                    auto* in_raw = packed.items[i].packed->data->exclusive_data_access();
+                    auto* in_raw = item_data_buffer(packed.items[i]).exclusive_data_access();
                     codec.compress(
                         static_cast<std::byte const*>(in_raw),
                         in_bytes,
@@ -515,12 +595,12 @@ RunResult run_once(
     }
     // Synchronize streams and unlock inputs
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const in_bytes = packed.items[i].packed->data->size;
+        auto const in_bytes = item_data_buffer(packed.items[i]).size;
         if (in_bytes == 0) {
             continue;
         }
         RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(comp_streams[i].value()));
-        packed.items[i].packed->data->unlock();
+        item_data_buffer(packed.items[i]).unlock();
     }
@@ -601,7 +681,7 @@ RunResult run_once(
     auto c0 = Clock::now();
     std::vector<rmm::cuda_stream_view> decomp_streams(packed.items.size());
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const out_bytes = packed.items[i].packed->data->size;
+        auto const out_bytes = item_data_buffer(packed.items[i]).size;
         if (out_bytes == 0) {
             continue;
         }
@@ -625,7 +705,7 @@ RunResult run_once(
     }
     // Synchronize each decomp stream and then unlock the corresponding input
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const out_bytes = packed.items[i].packed->data->size;
+        auto const out_bytes = item_data_buffer(packed.items[i]).size;
         if (out_bytes == 0) {
             continue;
         }