From 0f3dbc8b2ae09506f554e29ce8a27c27ffddf34c Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Thu, 6 Nov 2025 07:36:39 -0800
Subject: [PATCH 01/26] Add nvCOMP benchmark

---
 cpp/benchmarks/CMakeLists.txt      |  30 +
 cpp/benchmarks/bench_comp_comm.cpp | 916 +++++++++++++++++++++++++++++
 2 files changed, 946 insertions(+)
 create mode 100644 cpp/benchmarks/bench_comp_comm.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index da5bd7be3..01ec5528a 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -61,6 +61,36 @@ install(
   EXCLUDE_FROM_ALL
 )
 
+# ----------------------------------------------------------------------------
+# nvCOMP compression + Communicator benchmark
+find_package(nvcomp CONFIG REQUIRED)
+add_executable(bench_comp_comm "bench_comp_comm.cpp")
+set_target_properties(
+  bench_comp_comm
+  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$"
+             CXX_STANDARD 20
+             CXX_STANDARD_REQUIRED ON
+             # For std:: support of __int128_t. Can be removed once using cuda::std
+             CXX_EXTENSIONS ON
+             CUDA_STANDARD 20
+             CUDA_STANDARD_REQUIRED ON
+             LINK_FLAGS "-Wl,--allow-shlib-undefined"
+)
+target_compile_options(
+  bench_comp_comm PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAPIDSMPF_CXX_FLAGS}>"
+                          "$<$<COMPILE_LANGUAGE:CUDA>:${RAPIDSMPF_CUDA_FLAGS}>"
+)
+target_link_libraries(
+  bench_comp_comm PRIVATE rapidsmpf::rapidsmpf ucxx::ucxx $ nvcomp::nvcomp $
+                          maybe_asan bench_utils
+)
+install(
+  TARGETS bench_comp_comm
+  COMPONENT benchmarking
+  DESTINATION bin/benchmarks/librapidsmpf
+  EXCLUDE_FROM_ALL
+)
+
 # Find or install GoogleBench
 include(${rapids-cmake-dir}/cpm/gbench.cmake)
 rapids_cpm_gbench(BUILD_STATIC)

diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp
new file mode 100644
index 000000000..6cc557f32
--- /dev/null
+++ b/cpp/benchmarks/bench_comp_comm.cpp
@@ -0,0 +1,916 @@
/**
 * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RAPIDSMPF_HAVE_CUPTI +#include +#endif + +#include "utils/misc.hpp" +#include "utils/rmm_stack.hpp" + +// nvCOMP managers (v3.x API) +#include +#include + +using namespace rapidsmpf; + +namespace { + +enum class PackMode { + Table, + Columns +}; +enum class Algo { + Cascaded, + LZ4 +}; + +struct KvParams { + // Common + std::size_t chunk_size{1 << 20}; + // Cascaded + int cascaded_rle{1}; + int cascaded_delta{1}; + int cascaded_bitpack{1}; +}; + +struct Args { + std::string comm_type{"mpi"}; + std::uint64_t num_runs{1}; + std::uint64_t num_warmups{0}; + std::string rmm_mr{"pool"}; + std::string file_pattern; // required + PackMode pack_mode{PackMode::Table}; + Algo algo{Algo::Cascaded}; + KvParams params{}; + std::uint64_t num_ops{1}; + bool enable_cupti_monitoring{false}; + std::string cupti_csv_prefix; +}; + +std::vector expand_glob(std::string const& pattern) { + std::vector files; + glob_t glob_result{}; + int rc = glob(pattern.c_str(), GLOB_TILDE, nullptr, &glob_result); + if (rc == 0) { + for (std::size_t i = 0; i < glob_result.gl_pathc; ++i) { + files.emplace_back(glob_result.gl_pathv[i]); + } + } + globfree(&glob_result); + std::sort(files.begin(), files.end()); + return files; +} + +std::size_t parse_nbytes(std::string const& s) { + // Simple parser: supports suffixes KiB, MiB, GiB, KB, MB, GB, or no suffix. + auto to_lower = [](char c) { return static_cast(std::tolower(c)); }; + std::string v; + v.reserve(s.size()); + for (char c : s) + v.push_back(to_lower(c)); + + std::size_t mult = 1; + if (v.ends_with("kib")) { + mult = 1ull << 10; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("mib")) { + mult = 1ull << 20; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("gib")) { + mult = 1ull << 30; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("kb")) { + mult = 1000ull; + v = v.substr(0, v.size() - 2); + } else if (v.ends_with("mb")) { + mult = 1000ull * 1000ull; + v = v.substr(0, v.size() - 2); + } else if (v.ends_with("gb")) { + mult = 1000ull * 1000ull * 1000ull; + v = v.substr(0, v.size() - 2); + } + + return static_cast(std::stoll(v)) * mult; +} + +KvParams parse_kv_params(std::string const& kv) { + KvParams p{}; + if (kv.empty()) + return p; + std::size_t start = 0; + while (start < kv.size()) { + auto comma = kv.find(',', start); + auto part = kv.substr( + start, comma == std::string::npos ? 
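// Worked example for the two parsers here (illustrative values, not from the
// original patch): parse_nbytes("1MiB") == 1 << 20, parse_nbytes("2kb") == 2000,
// and a bare parse_nbytes("512") == 512. Consequently a flag such as
// -K "chunk_size=1MiB,rle=1,bitpack=0" produces a KvParams with
// chunk_size == 1 << 20, cascaded_rle == 1, cascaded_bitpack == 0, and
// cascaded_delta left at its default of 1.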
std::string::npos : comma - start + ); + auto eq = part.find('='); + if (eq != std::string::npos) { + std::string key = part.substr(0, eq); + std::string val = part.substr(eq + 1); + if (key == "chunk_size") + p.chunk_size = parse_nbytes(val); + else if (key == "delta") + p.cascaded_delta = std::stoi(val); + else if (key == "rle") + p.cascaded_rle = std::stoi(val); + else if (key == "bitpack") + p.cascaded_bitpack = std::stoi(val); + } + if (comma == std::string::npos) + break; + start = comma + 1; + } + return p; +} + +struct PhaseThroughputs { + double compress_Bps{0.0}; + double decompress_Bps{0.0}; + double comp_send_Bps{0.0}; + double recv_decomp_Bps{0.0}; + double send_only_Bps{0.0}; + double recv_only_Bps{0.0}; +}; + +class NvcompCodec { + public: + virtual ~NvcompCodec() = default; + virtual std::size_t get_max_compressed_bytes(std::size_t uncompressed_bytes) = 0; + virtual void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) = 0; + virtual void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) = 0; +}; + +class LZ4Codec final : public NvcompCodec { + public: + explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} + + std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { + nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + mgr.compress(d_out, d_in, cfg); + // Compressed size is stored at the beginning of output; ask manager + auto info = mgr.get_compress_result(d_out); + *out_bytes = info.compressed_bytes; + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0, stream.value()}; + auto cfg = mgr.configure_decompression(d_in, in_bytes); + (void)out_bytes; // decomp size implied by cfg + mgr.decompress(d_out, d_in, cfg); + } + + private: + std::size_t chunk_size_; +}; + +class CascadedCodec final : public NvcompCodec { + public: + CascadedCodec(std::size_t chunk_size, int rle, int delta, int bitpack) + : opts_{rle != 0, delta != 0, bitpack != 0, static_cast(chunk_size)} {} + + std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { + nvcomp::CascadedManager mgr{opts_}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{opts_, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + mgr.compress(d_out, d_in, cfg); + auto info = mgr.get_compress_result(d_out); + *out_bytes = info.compressed_bytes; + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{opts_, stream.value()}; + auto cfg = mgr.configure_decompression(d_in, in_bytes); + (void)out_bytes; + mgr.decompress(d_out, d_in, cfg); + } + + private: + 
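// Hedged usage sketch of the NvcompCodec interface defined above, mirroring the
// calls made later in run_once(); `br`, `stream`, `d_in` and `in_bytes` are
// assumed to be in scope:
//
//   auto codec = make_codec(Algo::LZ4, KvParams{});  // factory defined below
//   std::size_t max_out = codec->get_max_compressed_bytes(in_bytes);
//   auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE);
//   auto out = br->allocate(max_out, stream, reservation);
//   std::size_t out_bytes = 0;
//   codec->compress(d_in, in_bytes, /*d_out=*/..., &out_bytes, stream);
//   // On success out_bytes <= max_out; decompress() reverses the transform.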
nvcomp::CascadedOptions opts_{}; +}; + +std::unique_ptr make_codec(Algo algo, KvParams const& p) { + switch (algo) { + case Algo::LZ4: + return std::make_unique(p.chunk_size); + case Algo::Cascaded: + default: + return std::make_unique( + p.chunk_size, p.cascaded_rle, p.cascaded_delta, p.cascaded_bitpack + ); + } +} + +// Convenience: wrap metadata + gpu_data into rapidsmpf::PackedData +static std::unique_ptr pack_table_to_packed( + cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br +) { + auto packed = cudf::pack(tv, stream, br->device_mr()); + auto metadata = + std::make_unique>(std::move(packed.metadata)); + auto buf = br->move( + std::make_unique(std::move(packed.gpu_data)), stream + ); + return std::make_unique(std::move(metadata), std::move(buf)); +} + +struct ArgumentParser { + ArgumentParser(int argc, char* const* argv, bool use_mpi) { + int rank = 0; + if (use_mpi) { + RAPIDSMPF_EXPECTS(mpi::is_initialized() == true, "MPI is not initialized"); + RAPIDSMPF_MPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); + } + try { + int opt; + // C: comm, r: runs, w: warmups, m: rmm, F: files, P: pack mode, A: algo, K: + // kv, p: ops, M: cupti, h: help + while ((opt = getopt(argc, argv, "C:r:w:m:F:P:A:K:p:M:h")) != -1) { + switch (opt) { + case 'C': + args_.comm_type = std::string{optarg}; + break; + case 'r': + parse_integer(args_.num_runs, optarg); + break; + case 'w': + parse_integer(args_.num_warmups, optarg); + break; + case 'm': + args_.rmm_mr = std::string{optarg}; + break; + case 'F': + args_.file_pattern = std::string{optarg}; + break; + case 'P': + { + std::string v{optarg}; + if (v == "table") + args_.pack_mode = PackMode::Table; + else if (v == "columns") + args_.pack_mode = PackMode::Columns; + else + RAPIDSMPF_FAIL( + "-P must be one of {table, columns}", + std::invalid_argument + ); + break; + } + case 'A': + { + std::string v{optarg}; + if (v == "cascaded") + args_.algo = Algo::Cascaded; + else if (v == "lz4") + args_.algo = Algo::LZ4; + else + RAPIDSMPF_FAIL( + "-A must be one of {cascaded, lz4}", std::invalid_argument + ); + break; + } + case 'K': + args_.params = parse_kv_params(std::string{optarg}); + break; + case 'p': + parse_integer(args_.num_ops, optarg); + break; + case 'M': + args_.enable_cupti_monitoring = true; + args_.cupti_csv_prefix = std::string{optarg}; + break; + case 'h': + default: + { + std::stringstream ss; + ss << "Usage: " << argv[0] << " [options]\n" + << "Options:\n" + << " -C {mpi, ucxx} (default: mpi)\n" + << " -r Number of runs (default: 1)\n" + << " -w Number of warmup runs (default: 0)\n" + << " -m RMM MR {cuda, pool, async, managed} " + "(default: pool)\n" + << " -F Parquet file glob/pattern (required)\n" + << " -P Packing mode {table, columns} (default: " + "table)\n" + << " -A {cascaded, lz4} (default: cascaded)\n" + << " -K Algo params, e.g. 
" + "chunk_size=1MiB,delta=1,rle=1,bitpack=1\n" + << " -p Number of concurrent ops (default: 1)\n" + << " -M CUPTI CSV path prefix (enable CUPTI)\n" + << " -h Show this help\n"; + if (rank == 0) + std::cerr << ss.str(); + if (use_mpi) + RAPIDSMPF_MPI(MPI_Abort(MPI_COMM_WORLD, 0)); + else + std::exit(0); + } + } + } + } catch (std::exception const& e) { + std::cerr << "Error parsing arguments: " << e.what() << std::endl; + if (use_mpi) + RAPIDSMPF_MPI(MPI_Abort(MPI_COMM_WORLD, -1)); + else + std::exit(-1); + } + if (args_.file_pattern.empty()) { + std::cerr << "-F is required" << std::endl; + if (use_mpi) + RAPIDSMPF_MPI(MPI_Abort(MPI_COMM_WORLD, -1)); + else + std::exit(-1); + } + if (args_.rmm_mr == "cuda") { + if (rank == 0) { + std::cout << "WARNING: using the default cuda memory resource (-m cuda) " + "might leak memory!" + << std::endl; + } + } + } + + Args const& get() const { + return args_; + } + + private: + Args args_{}; +}; + +struct PackedItem { + // Ownership: we store size and buffer pointer for the packed payload + std::unique_ptr packed; // original packed cudf table/column +}; + +struct BuffersToSend { + // For each op, we will send these items + std::vector items; + std::size_t total_uncompressed_bytes{0}; +}; + +BuffersToSend make_packed_items( + cudf::table const& table, + PackMode mode, + rmm::cuda_stream_view stream, + BufferResource* br +) { + BuffersToSend ret{}; + if (mode == PackMode::Table) { + auto item = PackedItem{}; + item.packed = pack_table_to_packed(table.view(), stream, br); + ret.total_uncompressed_bytes += + item.packed->data->size + item.packed->metadata->size(); + ret.items.emplace_back(std::move(item)); + } else { + auto tv = table.view(); + for (cudf::size_type i = 0; i < tv.num_columns(); ++i) { + cudf::table_view col_tv{std::vector{tv.column(i)}}; + auto item = PackedItem{}; + item.packed = pack_table_to_packed(col_tv, stream, br); + ret.total_uncompressed_bytes += + item.packed->data->size + item.packed->metadata->size(); + ret.items.emplace_back(std::move(item)); + } + } + return ret; +} + +// Send/recv helpers: send a header (compressed size) as host buffer. +struct SizeHeader { + std::uint64_t bytes; +}; + +struct Timings { + double compress_s{0.0}; + double decompress_s{0.0}; + double comp_send_s{0.0}; + double recv_decomp_s{0.0}; + double send_only_s{0.0}; + double recv_only_s{0.0}; +}; + +// Returns timings and bytes counters +struct Counters { + std::size_t logical_uncompressed_bytes{0}; + std::size_t logical_compressed_bytes{0}; +}; + +struct RunResult { + Timings times; + Counters counts; +}; + +RunResult run_once( + std::shared_ptr const& comm, + Args const& args, + rmm::cuda_stream_view stream, + BufferResource* br, + std::shared_ptr const& statistics, + BuffersToSend const& data, + NvcompCodec& codec +) { + auto const nranks = comm->nranks(); + auto const rank = comm->rank(); + auto const dst = static_cast((rank + 1) % nranks); + auto const src = static_cast((rank - 1 + nranks) % nranks); + + Tag tag_size{1, 0}; + Tag tag_payload{1, 1}; + Tag tag_nocomp{2, 0}; + + // Clone packed items into raw device buffers for repeated ops + std::vector> nocomp_payloads; + nocomp_payloads.reserve(data.items.size()); + for (auto const& it : data.items) { + // Copy metadata + data into a contiguous device buffer for pure send path? + // For pure send/recv, we only send the device payload; metadata isn't needed for + // metrics. We'll send the packed->data buffer. 
+ auto reservation = br->reserve_or_fail(it.packed->data->size, MemoryType::DEVICE); + auto buf = br->allocate(it.packed->data->size, stream, reservation); + buffer_copy(*buf, *it.packed->data, it.packed->data->size); + nocomp_payloads.emplace_back(std::move(buf)); + } + + // Pre-allocate compression outputs for each item + std::vector> comp_outputs; + std::vector comp_output_sizes(data.items.size()); + comp_outputs.reserve(data.items.size()); + for (std::size_t i = 0; i < data.items.size(); ++i) { + auto const in_bytes = data.items[i].packed->data->size; + auto const max_out = codec.get_max_compressed_bytes(in_bytes); + auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE); + comp_outputs.emplace_back(br->allocate(max_out, stream, reservation)); + } + + RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); + auto t0 = Clock::now(); + // Compress all items (single batch) on stream + for (std::size_t i = 0; i < data.items.size(); ++i) { + void const* d_in = data.items[i].packed->data->device()->get()->data(); + void* d_out = comp_outputs[i]->device()->get()->data(); + std::size_t out_bytes = 0; + codec.compress(d_in, data.items[i].packed->data->size, d_out, &out_bytes, stream); + comp_output_sizes[i] = out_bytes; + } + RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); + auto t1 = Clock::now(); + + // Phase A: pure send/recv (no compression) + auto a0 = Clock::now(); + std::vector> send_futs; + std::vector> recv_futs; + send_futs.reserve(args.num_ops * nocomp_payloads.size()); + recv_futs.reserve(args.num_ops * nocomp_payloads.size()); + + for (std::uint64_t op = 0; op < args.num_ops; ++op) { + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + // post recv first + if (src != rank) { + auto res = + br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); + auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + recv_futs.push_back(comm->recv(src, tag_nocomp, std::move(recv_buf))); + } + } + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + if (dst != rank) { + auto res = + br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); + auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); + send_futs.push_back(comm->send(std::move(send_buf), dst, tag_nocomp)); + } + } + } + while (!send_futs.empty()) { + std::ignore = comm->test_some(send_futs); + } + auto a1 = Clock::now(); + while (!recv_futs.empty()) { + std::ignore = comm->test_some(recv_futs); + } + auto a2 = Clock::now(); + + // Phase B: compressed path (send size header, then compressed payload) + auto b0 = Clock::now(); + std::vector> send_hdr_futs; + std::vector> send_cmp_futs; + std::vector> recv_hdr_futs; + std::vector> recv_cmp_futs; + send_hdr_futs.reserve(args.num_ops * data.items.size()); + send_cmp_futs.reserve(args.num_ops * data.items.size()); + recv_hdr_futs.reserve(args.num_ops * data.items.size()); + recv_cmp_futs.reserve(args.num_ops * data.items.size()); + + for (std::uint64_t op = 0; op < args.num_ops; ++op) { + for (std::size_t i = 0; i < data.items.size(); ++i) { + // post recv header + if (src != rank) { + auto res_h = br->reserve_or_fail(sizeof(SizeHeader), MemoryType::HOST); + auto hdr = br->allocate(sizeof(SizeHeader), stream, res_h); + recv_hdr_futs.push_back(comm->recv(src, tag_size, std::move(hdr))); + } + } + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (dst != rank) { + auto res_h = br->reserve_or_fail(sizeof(SizeHeader), MemoryType::HOST); + auto hdr = 
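// Two idioms meet at this point (hedged summary of the surrounding code): every
// buffer follows the reserve_or_fail() -> allocate() -> fill pattern of
// BufferResource, and the compressed path uses a header-then-payload protocol:
// a fixed-size SizeHeader carrying the compressed byte count travels on
// tag_size first, so the peer can post an exactly-sized device recv on
// tag_payload. The header write just below has this shape:
//
//   hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) {
//       SizeHeader h{static_cast<std::uint64_t>(comp_output_sizes[i])};
//       std::memcpy(p, &h, sizeof(SizeHeader));
//   });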
br->allocate(sizeof(SizeHeader), stream, res_h); + // write header + hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { + SizeHeader h{static_cast(comp_output_sizes[i])}; + std::memcpy(p, &h, sizeof(SizeHeader)); + }); + send_hdr_futs.push_back(comm->send(std::move(hdr), dst, tag_size)); + } + } + } + while (!send_hdr_futs.empty()) { + std::ignore = comm->test_some(send_hdr_futs); + } + while (!recv_hdr_futs.empty()) { + std::ignore = comm->test_some(recv_hdr_futs); + } + + // Post payload recvs now that we know sizes + for (std::uint64_t op = 0; op < args.num_ops; ++op) { + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (src != rank) { + // reuse comp_output_sizes[i] as expected size since peers are symmetric + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto buf = br->allocate(comp_output_sizes[i], stream, res); + recv_cmp_futs.push_back(comm->recv(src, tag_payload, std::move(buf))); + } + } + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (dst != rank) { + // send compressed + auto tmp_buf = + br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto send_buf = br->allocate(comp_output_sizes[i], stream, tmp_buf); + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + send_cmp_futs.push_back( + comm->send(std::move(send_buf), dst, tag_payload) + ); + } + } + } + while (!send_cmp_futs.empty()) { + std::ignore = comm->test_some(send_cmp_futs); + } + auto b1 = Clock::now(); + while (!recv_cmp_futs.empty()) { + std::ignore = comm->test_some(recv_cmp_futs); + } + auto b2 = Clock::now(); + + // Decompress received buffers (simulate by decompressing our own produced outputs in + // symmetric setup) + auto c0 = Clock::now(); + for (std::size_t i = 0; i < data.items.size(); ++i) { + auto const out_bytes = data.items[i].packed->data->size; + auto res = br->reserve_or_fail(out_bytes, MemoryType::DEVICE); + auto out = br->allocate(out_bytes, stream, res); + void const* d_in = comp_outputs[i]->device()->get()->data(); + void* d_out = out->device()->get()->data(); + codec.decompress(d_in, comp_output_sizes[i], d_out, out_bytes, stream); + } + RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); + auto c1 = Clock::now(); + + RunResult result{}; + result.times.compress_s = std::chrono::duration(t1 - t0).count(); + result.times.send_only_s = std::chrono::duration(a1 - a0).count(); + result.times.recv_only_s = std::chrono::duration(a2 - a1).count(); + result.times.comp_send_s = std::chrono::duration(b1 - b0).count(); + result.times.recv_decomp_s = std::chrono::duration(b2 - b1).count() + + std::chrono::duration(c1 - c0).count(); + result.times.decompress_s = std::chrono::duration(c1 - c0).count(); + + result.counts.logical_uncompressed_bytes = + data.total_uncompressed_bytes * args.num_ops; + result.counts.logical_compressed_bytes = + std::accumulate( + comp_output_sizes.begin(), comp_output_sizes.end(), std::size_t{0} + ) + * args.num_ops; + return result; +} + +} // namespace + +int main(int argc, char** argv) { + // Check if we should use bootstrap mode with rrun + bool use_bootstrap = std::getenv("RAPIDSMPF_RANK") != nullptr; + + int provided = 0; + if (!use_bootstrap) { + RAPIDSMPF_MPI(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided)); + RAPIDSMPF_EXPECTS( + provided == MPI_THREAD_MULTIPLE, + "didn't get the requested thread level support: MPI_THREAD_MULTIPLE" + ); + } + + ArgumentParser parser{argc, argv, !use_bootstrap}; + Args const& args = parser.get(); + + // Initialize configuration options from 
environment variables. + rapidsmpf::config::Options options{rapidsmpf::config::get_environment_variables()}; + + std::shared_ptr comm; + if (args.comm_type == "mpi") { + if (use_bootstrap) { + std::cerr << "Error: MPI communicator requires MPI initialization. Don't use " + "with rrun or unset RAPIDSMPF_RANK." + << std::endl; + return 1; + } + mpi::init(&argc, &argv); + comm = std::make_shared(MPI_COMM_WORLD, options); + } else if (args.comm_type == "ucxx") { + if (use_bootstrap) { + comm = rapidsmpf::bootstrap::create_ucxx_comm( + rapidsmpf::bootstrap::Backend::AUTO, options + ); + } else { + comm = rapidsmpf::ucxx::init_using_mpi(MPI_COMM_WORLD, options); + } + } else { + std::cerr << "Unknown communicator: " << args.comm_type << std::endl; + return 1; + } + + auto& log = comm->logger(); + rmm::cuda_stream_view stream = cudf::get_default_stream(); + + // RMM setup + auto const mr_stack = set_current_rmm_stack(args.rmm_mr); + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); + BufferResource br{mr}; + + // Hardware info + { + std::stringstream ss; + auto const cur_dev = rmm::get_current_cuda_device().value(); + std::string pci_bus_id(16, '\0'); + RAPIDSMPF_CUDA_TRY( + cudaDeviceGetPCIBusId(pci_bus_id.data(), pci_bus_id.size(), cur_dev) + ); + cudaDeviceProp properties; + RAPIDSMPF_CUDA_TRY(cudaGetDeviceProperties(&properties, 0)); + ss << "Hardware setup: \n"; + ss << " GPU (" << properties.name << "): \n"; + ss << " Device number: " << cur_dev << "\n"; + ss << " PCI Bus ID: " << pci_bus_id.substr(0, pci_bus_id.find('\0')) << "\n"; + ss << " Total Memory: " << format_nbytes(properties.totalGlobalMem, 0) << "\n"; + ss << " Comm: " << *comm << "\n"; + log.print(ss.str()); + } + + // Stats and CUPTI + auto stats = std::make_shared(/* enable = */ false); +#ifdef RAPIDSMPF_HAVE_CUPTI + std::unique_ptr cupti_monitor; + if (args.enable_cupti_monitoring) { + cupti_monitor = std::make_unique(); + cupti_monitor->start_monitoring(); + log.print("CUPTI memory monitoring enabled"); + } +#endif + + // File selection per rank + auto files = expand_glob(args.file_pattern); + if (files.empty()) { + if (comm->rank() == 0) + log.print("No files matched pattern: " + args.file_pattern); + if (!use_bootstrap) + RAPIDSMPF_MPI(MPI_Finalize()); + return 1; + } + auto my_file = files[static_cast(comm->rank()) % files.size()]; + if (comm->rank() == 0) + log.print( + "Using file pattern: " + args.file_pattern + ", first file: " + files.front() + ); + log.print("Rank " + std::to_string(comm->rank()) + " reading: " + my_file); + + // Read Parquet into cudf::table + cudf::io::parquet_reader_options reader_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{my_file}); + auto table_with_md = cudf::io::read_parquet(reader_opts); + auto& table = table_with_md.tbl; + + // Pack per mode + auto packed = make_packed_items(*table, args.pack_mode, stream, &br); + + // Prepare codec + auto codec = make_codec(args.algo, args.params); + + // Runs + std::vector compress_t, decompress_t, comp_send_t, recv_decomp_t, send_t, + recv_t; + compress_t.reserve(args.num_runs); + decompress_t.reserve(args.num_runs); + comp_send_t.reserve(args.num_runs); + recv_decomp_t.reserve(args.num_runs); + send_t.reserve(args.num_runs); + recv_t.reserve(args.num_runs); + + std::size_t logical_bytes = packed.total_uncompressed_bytes * args.num_ops; + + for (std::uint64_t i = 0; i < args.num_warmups + args.num_runs; ++i) { + if (i == args.num_warmups + args.num_runs - 1) { + stats = std::make_shared(/* enable 
= */ true); + } + auto rr = run_once(comm, args, stream, &br, stats, packed, *codec); + + double cBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.compress_s; + double dBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.decompress_s; + double csBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.comp_send_s; + double rdBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.recv_decomp_s; + double sBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.send_only_s; + double rBps = static_cast(rr.counts.logical_uncompressed_bytes) + / rr.times.recv_only_s; + + std::stringstream ss; + ss << "compress: " << format_nbytes(cBps) + << "/s | decompress: " << format_nbytes(dBps) + << "/s | comp+send: " << format_nbytes(csBps) + << "/s | recv+decomp: " << format_nbytes(rdBps) + << "/s | send-only: " << format_nbytes(sBps) + << "/s | recv-only: " << format_nbytes(rBps) << "/s"; + if (i < args.num_warmups) + ss << " (warmup run)"; + log.print(ss.str()); + + if (i >= args.num_warmups) { + compress_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / cBps + ); + decompress_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / dBps + ); + comp_send_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / csBps + ); + recv_decomp_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / rdBps + ); + send_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / sBps + ); + recv_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / rBps + ); + } + } + + // Means + auto harmonic_mean = [](std::vector const& v) { + double denom_sum = 0.0; + for (auto x : v) + denom_sum += 1.0 / x; + return static_cast(v.size()) / denom_sum; + }; + + if (!compress_t.empty()) { + double mean_elapsed_c = harmonic_mean(compress_t); + double mean_elapsed_d = harmonic_mean(decompress_t); + double mean_elapsed_cs = harmonic_mean(comp_send_t); + double mean_elapsed_rd = harmonic_mean(recv_decomp_t); + double mean_elapsed_s = harmonic_mean(send_t); + double mean_elapsed_r = harmonic_mean(recv_t); + + std::stringstream ss; + ss << "means: compress: " << format_nbytes(logical_bytes / mean_elapsed_c) << "/s" + << " | decompress: " << format_nbytes(logical_bytes / mean_elapsed_d) << "/s" + << " | comp+send: " << format_nbytes(logical_bytes / mean_elapsed_cs) << "/s" + << " | recv+decomp: " << format_nbytes(logical_bytes / mean_elapsed_rd) << "/s" + << " | send-only: " << format_nbytes(logical_bytes / mean_elapsed_s) << "/s" + << " | recv-only: " << format_nbytes(logical_bytes / mean_elapsed_r) << "/s"; + log.print(ss.str()); + } + +#ifdef RAPIDSMPF_HAVE_CUPTI + if (args.enable_cupti_monitoring && cupti_monitor) { + cupti_monitor->stop_monitoring(); + std::string csv_filename = + args.cupti_csv_prefix + std::to_string(comm->rank()) + ".csv"; + try { + cupti_monitor->write_csv(csv_filename); + log.print( + "CUPTI memory data written to " + csv_filename + " (" + + std::to_string(cupti_monitor->get_sample_count()) + " samples, " + + std::to_string(cupti_monitor->get_total_callback_count()) + + " callbacks)" + ); + if (comm->rank() == 0) { + log.print( + "CUPTI Callback Summary:\n" + cupti_monitor->get_callback_summary() + ); + } + } catch (std::exception const& e) { + log.print("Failed to write CUPTI CSV file: " + std::string(e.what())); + } + } +#endif + + if (!use_bootstrap) { + RAPIDSMPF_MPI(MPI_Finalize()); + } + return 0; +} From d0575cbd72ae20ec0cbb2bb2e422c9ef9dac8b7a Mon Sep 17 
00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 6 Nov 2025 07:49:13 -0800 Subject: [PATCH 02/26] Fixes --- cpp/benchmarks/bench_comp_comm.cpp | 109 +++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 6cc557f32..e2e0769ca 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -197,7 +197,9 @@ class LZ4Codec final : public NvcompCodec { explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { - nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0}; + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, 0}; auto cfg = mgr.configure_compression(in_bytes); return cfg.max_compressed_buffer_size; } @@ -209,12 +211,16 @@ class LZ4Codec final : public NvcompCodec { std::size_t* out_bytes, rmm::cuda_stream_view stream ) override { - nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0, stream.value()}; + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - mgr.compress(d_out, d_in, cfg); - // Compressed size is stored at the beginning of output; ask manager - auto info = mgr.get_compress_result(d_out); - *out_bytes = info.compressed_bytes; + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + out_bytes + ); } void decompress( @@ -224,10 +230,15 @@ class LZ4Codec final : public NvcompCodec { std::size_t out_bytes, rmm::cuda_stream_view stream ) override { - nvcomp::LZ4Manager mgr{static_cast(chunk_size_), 0, stream.value()}; - auto cfg = mgr.configure_decompression(d_in, in_bytes); - (void)out_bytes; // decomp size implied by cfg - mgr.decompress(d_out, d_in, cfg); + (void)out_bytes; + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); } private: @@ -237,10 +248,16 @@ class LZ4Codec final : public NvcompCodec { class CascadedCodec final : public NvcompCodec { public: CascadedCodec(std::size_t chunk_size, int rle, int delta, int bitpack) - : opts_{rle != 0, delta != 0, bitpack != 0, static_cast(chunk_size)} {} + : chunk_size_{chunk_size} { + copts_ = nvcompBatchedCascadedCompressDefaultOpts; + copts_.num_RLEs = rle ? 1 : 0; + copts_.num_deltas = delta ? 1 : 0; + copts_.use_bp = bitpack ? 
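// Mapping from -K flags to the batched cascaded options used here (worked
// example, values illustrative): rle=1,delta=0,bitpack=1 yields num_RLEs == 1,
// num_deltas == 0, and use_bp == 1 on nvcompBatchedCascadedCompressOpts_t.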
1 : 0; + dopts_ = nvcompBatchedCascadedDecompressDefaultOpts; + } std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { - nvcomp::CascadedManager mgr{opts_}; + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, 0}; auto cfg = mgr.configure_compression(in_bytes); return cfg.max_compressed_buffer_size; } @@ -252,11 +269,14 @@ class CascadedCodec final : public NvcompCodec { std::size_t* out_bytes, rmm::cuda_stream_view stream ) override { - nvcomp::CascadedManager mgr{opts_, stream.value()}; + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - mgr.compress(d_out, d_in, cfg); - auto info = mgr.get_compress_result(d_out); - *out_bytes = info.compressed_bytes; + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + out_bytes + ); } void decompress( @@ -266,14 +286,19 @@ class CascadedCodec final : public NvcompCodec { std::size_t out_bytes, rmm::cuda_stream_view stream ) override { - nvcomp::CascadedManager mgr{opts_, stream.value()}; - auto cfg = mgr.configure_decompression(d_in, in_bytes); (void)out_bytes; - mgr.decompress(d_out, d_in, cfg); + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); } private: - nvcomp::CascadedOptions opts_{}; + std::size_t chunk_size_{}; + nvcompBatchedCascadedCompressOpts_t copts_{}; + nvcompBatchedCascadedDecompressOpts_t dopts_{}; }; std::unique_ptr make_codec(Algo algo, KvParams const& p) { @@ -293,12 +318,9 @@ static std::unique_ptr pack_table_to_packed( cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br ) { auto packed = cudf::pack(tv, stream, br->device_mr()); - auto metadata = - std::make_unique>(std::move(packed.metadata)); - auto buf = br->move( - std::make_unique(std::move(packed.gpu_data)), stream + return std::make_unique( + std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) ); - return std::make_unique(std::move(metadata), std::move(buf)); } struct ArgumentParser { @@ -498,6 +520,7 @@ RunResult run_once( BuffersToSend const& data, NvcompCodec& codec ) { + (void)statistics; auto const nranks = comm->nranks(); auto const rank = comm->rank(); auto const dst = static_cast((rank + 1) % nranks); @@ -535,10 +558,23 @@ RunResult run_once( auto t0 = Clock::now(); // Compress all items (single batch) on stream for (std::size_t i = 0; i < data.items.size(); ++i) { - void const* d_in = data.items[i].packed->data->device()->get()->data(); - void* d_out = comp_outputs[i]->device()->get()->data(); std::size_t out_bytes = 0; - codec.compress(d_in, data.items[i].packed->data->size, d_out, &out_bytes, stream); + // Use exclusive access to fetch pointers and call codec on the provided stream + auto* in_buf = data.items[i].packed->data.get(); + auto* out_buf = comp_outputs[i].get(); + in_buf->stream().synchronize(); + out_buf->stream().synchronize(); + auto* in_raw = data.items[i].packed->data->exclusive_data_access(); + auto* out_raw = comp_outputs[i]->exclusive_data_access(); + codec.compress( + static_cast(in_raw), + data.items[i].packed->data->size, + static_cast(out_raw), + &out_bytes, + stream + ); + comp_outputs[i]->unlock(); + data.items[i].packed->data->unlock(); comp_output_sizes[i] = out_bytes; } RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); @@ 
-659,9 +695,20 @@ RunResult run_once( auto const out_bytes = data.items[i].packed->data->size; auto res = br->reserve_or_fail(out_bytes, MemoryType::DEVICE); auto out = br->allocate(out_bytes, stream, res); - void const* d_in = comp_outputs[i]->device()->get()->data(); - void* d_out = out->device()->get()->data(); - codec.decompress(d_in, comp_output_sizes[i], d_out, out_bytes, stream); + // Synchronize prior to exclusive access + comp_outputs[i]->stream().synchronize(); + out->stream().synchronize(); + auto* in_raw = comp_outputs[i]->exclusive_data_access(); + auto* out_raw = out->exclusive_data_access(); + codec.decompress( + static_cast(in_raw), + comp_output_sizes[i], + static_cast(out_raw), + out_bytes, + stream + ); + out->unlock(); + comp_outputs[i]->unlock(); } RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto c1 = Clock::now(); From 5a677f9cc8d10d9015b30d9726806b5e9a957030 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 6 Nov 2025 08:06:54 -0800 Subject: [PATCH 03/26] Fix illegal memory access --- cpp/benchmarks/bench_comp_comm.cpp | 112 +++++++++++++++++++---------- 1 file changed, 74 insertions(+), 38 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index e2e0769ca..5f88bc5eb 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -175,7 +175,9 @@ struct PhaseThroughputs { class NvcompCodec { public: virtual ~NvcompCodec() = default; - virtual std::size_t get_max_compressed_bytes(std::size_t uncompressed_bytes) = 0; + virtual std::size_t get_max_compressed_bytes( + std::size_t uncompressed_bytes, rmm::cuda_stream_view stream + ) = 0; virtual void compress( void const* d_in, std::size_t in_bytes, @@ -196,10 +198,12 @@ class LZ4Codec final : public NvcompCodec { public: explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} - std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; - nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, 0}; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); return cfg.max_compressed_buffer_size; } @@ -215,12 +219,19 @@ class LZ4Codec final : public NvcompCodec { nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - out_bytes + pinned_bytes ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); } void decompress( @@ -256,8 +267,10 @@ class CascadedCodec final : public NvcompCodec { dopts_ = nvcompBatchedCascadedDecompressDefaultOpts; } - std::size_t get_max_compressed_bytes(std::size_t in_bytes) override { - nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, 0}; + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = 
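// Readback pattern introduced by this patch in both compress() overrides (LZ4
// above, Cascaded below): nvcomp publishes the compressed size asynchronously,
// so the result must land in pinned host memory and be read only after the
// stream is synchronized. Hedged restatement of that idiom in isolation:
//
//   size_t* pinned_bytes = nullptr;
//   RAPIDSMPF_CUDA_TRY(cudaHostAlloc(
//       reinterpret_cast<void**>(&pinned_bytes), sizeof(size_t),
//       cudaHostAllocDefault
//   ));
//   mgr.compress(in_ptr, out_ptr, cfg, pinned_bytes);  // async on the stream
//   RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
//   *out_bytes = *pinned_bytes;  // only now safe to read on the host
//   RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes));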
mgr.configure_compression(in_bytes); return cfg.max_compressed_buffer_size; } @@ -271,12 +284,19 @@ class CascadedCodec final : public NvcompCodec { ) override { nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - out_bytes + pinned_bytes ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); } void decompress( @@ -549,7 +569,8 @@ RunResult run_once( comp_outputs.reserve(data.items.size()); for (std::size_t i = 0; i < data.items.size(); ++i) { auto const in_bytes = data.items[i].packed->data->size; - auto const max_out = codec.get_max_compressed_bytes(in_bytes); + std::size_t const max_out = + (in_bytes == 0) ? 1 : codec.get_max_compressed_bytes(in_bytes, stream); auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE); comp_outputs.emplace_back(br->allocate(max_out, stream, reservation)); } @@ -558,24 +579,35 @@ RunResult run_once( auto t0 = Clock::now(); // Compress all items (single batch) on stream for (std::size_t i = 0; i < data.items.size(); ++i) { - std::size_t out_bytes = 0; - // Use exclusive access to fetch pointers and call codec on the provided stream - auto* in_buf = data.items[i].packed->data.get(); - auto* out_buf = comp_outputs[i].get(); - in_buf->stream().synchronize(); - out_buf->stream().synchronize(); - auto* in_raw = data.items[i].packed->data->exclusive_data_access(); - auto* out_raw = comp_outputs[i]->exclusive_data_access(); - codec.compress( - static_cast(in_raw), - data.items[i].packed->data->size, - static_cast(out_raw), - &out_bytes, - stream + auto const in_bytes = data.items[i].packed->data->size; + if (in_bytes == 0) { + comp_output_sizes[i] = 0; + continue; + } + // Ensure any prior writes to input are completed + data.items[i].packed->data->stream().synchronize(); + // Launch compression on the output buffer's stream and record an event after + comp_outputs[i]->write_access( + [&codec, &data, i, in_bytes, &comp_output_sizes, stream]( + std::byte* out_ptr, rmm::cuda_stream_view out_stream + ) { + (void)out_ptr; // pointer used below + // Lock input for raw pointer access + auto* in_raw = data.items[i].packed->data->exclusive_data_access(); + std::size_t out_bytes = 0; + codec.compress( + static_cast(in_raw), + in_bytes, + static_cast(out_ptr), + &out_bytes, + out_stream + ); + // Ensure comp_bytes is populated before returning + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(out_stream.value())); + data.items[i].packed->data->unlock(); + comp_output_sizes[i] = out_bytes; + } ); - comp_outputs[i]->unlock(); - data.items[i].packed->data->unlock(); - comp_output_sizes[i] = out_bytes; } RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = Clock::now(); @@ -693,22 +725,26 @@ RunResult run_once( auto c0 = Clock::now(); for (std::size_t i = 0; i < data.items.size(); ++i) { auto const out_bytes = data.items[i].packed->data->size; + if (out_bytes == 0) { + continue; + } auto res = br->reserve_or_fail(out_bytes, MemoryType::DEVICE); auto out = br->allocate(out_bytes, stream, res); - // Synchronize prior to exclusive access + // Ensure compressed outputs are ready before using as input comp_outputs[i]->stream().synchronize(); - out->stream().synchronize(); - auto* in_raw = 
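// Buffer-locking idiom used by the rewritten code in this hunk (hedged sketch;
// `src` and `dst` stand for the compressed input and decompressed output
// Buffers): write_access() hands the callback the destination pointer plus the
// buffer's own stream, while exclusive_data_access()/unlock() bracket raw
// access to the source:
//
//   dst->write_access([&](std::byte* out_ptr, rmm::cuda_stream_view s) {
//       auto* in_raw = src->exclusive_data_access();
//       codec.decompress(in_raw, in_bytes, out_ptr, out_bytes, s);  // casts elided
//       src->unlock();
//   });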
comp_outputs[i]->exclusive_data_access(); - auto* out_raw = out->exclusive_data_access(); - codec.decompress( - static_cast(in_raw), - comp_output_sizes[i], - static_cast(out_raw), - out_bytes, - stream - ); - out->unlock(); - comp_outputs[i]->unlock(); + out->write_access([&codec, &comp_outputs, &comp_output_sizes, i, out_bytes]( + std::byte* out_ptr, rmm::cuda_stream_view out_stream + ) { + auto* in_raw = comp_outputs[i]->exclusive_data_access(); + codec.decompress( + static_cast(in_raw), + comp_output_sizes[i], + static_cast(out_ptr), + out_bytes, + out_stream + ); + comp_outputs[i]->unlock(); + }); } RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto c1 = Clock::now(); From dd793fdae7dfe09943803ff844820693f56da1c1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 6 Nov 2025 08:53:41 -0800 Subject: [PATCH 04/26] Wait for buffer before sending --- cpp/benchmarks/bench_comp_comm.cpp | 34 +++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 5f88bc5eb..a678796df 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -635,6 +635,9 @@ RunResult run_once( br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); + if (!send_buf->is_latest_write_done()) { + send_buf->stream().synchronize(); + } send_futs.push_back(comm->send(std::move(send_buf), dst, tag_nocomp)); } } @@ -677,6 +680,9 @@ RunResult run_once( SizeHeader h{static_cast(comp_output_sizes[i])}; std::memcpy(p, &h, sizeof(SizeHeader)); }); + if (!hdr->is_latest_write_done()) { + hdr->stream().synchronize(); + } send_hdr_futs.push_back(comm->send(std::move(hdr), dst, tag_size)); } } @@ -693,21 +699,29 @@ RunResult run_once( for (std::size_t i = 0; i < data.items.size(); ++i) { if (src != rank) { // reuse comp_output_sizes[i] as expected size since peers are symmetric - auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto buf = br->allocate(comp_output_sizes[i], stream, res); - recv_cmp_futs.push_back(comm->recv(src, tag_payload, std::move(buf))); + if (comp_output_sizes[i] > 0) { + auto res = + br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto buf = br->allocate(comp_output_sizes[i], stream, res); + recv_cmp_futs.push_back(comm->recv(src, tag_payload, std::move(buf))); + } } } for (std::size_t i = 0; i < data.items.size(); ++i) { if (dst != rank) { // send compressed - auto tmp_buf = - br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto send_buf = br->allocate(comp_output_sizes[i], stream, tmp_buf); - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - send_cmp_futs.push_back( - comm->send(std::move(send_buf), dst, tag_payload) - ); + if (comp_output_sizes[i] > 0) { + auto tmp_buf = + br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto send_buf = br->allocate(comp_output_sizes[i], stream, tmp_buf); + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + if (!send_buf->is_latest_write_done()) { + send_buf->stream().synchronize(); + } + send_cmp_futs.push_back( + comm->send(std::move(send_buf), dst, tag_payload) + ); + } } } } From 19c81c8e1dc0703d8489fd63ca16c416cb11247d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 6 Nov 2025 10:20:49 -0800 Subject: [PATCH 05/26] Improve measuring received 
messages --- cpp/benchmarks/bench_comp_comm.cpp | 40 +++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index a678796df..43d2eef63 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -476,7 +476,10 @@ struct PackedItem { struct BuffersToSend { // For each op, we will send these items std::vector items; + // Uncompressed bytes that represent the actual payload we transmit (device data only) std::size_t total_uncompressed_bytes{0}; + // Convenience: device payload bytes (same as total_uncompressed_bytes here) + std::size_t total_payload_bytes{0}; }; BuffersToSend make_packed_items( @@ -489,8 +492,8 @@ BuffersToSend make_packed_items( if (mode == PackMode::Table) { auto item = PackedItem{}; item.packed = pack_table_to_packed(table.view(), stream, br); - ret.total_uncompressed_bytes += - item.packed->data->size + item.packed->metadata->size(); + ret.total_uncompressed_bytes += item.packed->data->size; + ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); } else { auto tv = table.view(); @@ -498,8 +501,8 @@ BuffersToSend make_packed_items( cudf::table_view col_tv{std::vector{tv.column(i)}}; auto item = PackedItem{}; item.packed = pack_table_to_packed(col_tv, stream, br); - ret.total_uncompressed_bytes += - item.packed->data->size + item.packed->metadata->size(); + ret.total_uncompressed_bytes += item.packed->data->size; + ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); } } @@ -642,14 +645,23 @@ RunResult run_once( } } } - while (!send_futs.empty()) { - std::ignore = comm->test_some(send_futs); - } - auto a1 = Clock::now(); - while (!recv_futs.empty()) { - std::ignore = comm->test_some(recv_futs); + // Drive send/recv completion concurrently and timestamp each independently + std::optional send_done_tp; + std::optional recv_done_tp; + while (!send_futs.empty() || !recv_futs.empty()) { + if (!send_futs.empty()) { + std::ignore = comm->test_some(send_futs); + if (!send_done_tp && send_futs.empty()) + send_done_tp = Clock::now(); + } + if (!recv_futs.empty()) { + std::ignore = comm->test_some(recv_futs); + if (!recv_done_tp && recv_futs.empty()) + recv_done_tp = Clock::now(); + } } - auto a2 = Clock::now(); + auto a1 = send_done_tp.value_or(Clock::now()); + auto a2 = recv_done_tp.value_or(Clock::now()); // Phase B: compressed path (send size header, then compressed payload) auto b0 = Clock::now(); @@ -766,14 +778,14 @@ RunResult run_once( RunResult result{}; result.times.compress_s = std::chrono::duration(t1 - t0).count(); result.times.send_only_s = std::chrono::duration(a1 - a0).count(); - result.times.recv_only_s = std::chrono::duration(a2 - a1).count(); + result.times.recv_only_s = std::chrono::duration(a2 - a0).count(); result.times.comp_send_s = std::chrono::duration(b1 - b0).count(); result.times.recv_decomp_s = std::chrono::duration(b2 - b1).count() + std::chrono::duration(c1 - c0).count(); result.times.decompress_s = std::chrono::duration(c1 - c0).count(); - result.counts.logical_uncompressed_bytes = - data.total_uncompressed_bytes * args.num_ops; + // Use payload (device) bytes as the logical uncompressed size for throughput + result.counts.logical_uncompressed_bytes = data.total_payload_bytes * args.num_ops; result.counts.logical_compressed_bytes = std::accumulate( comp_output_sizes.begin(), comp_output_sizes.end(), std::size_t{0} From 
802d95c3c8024b2a1efa4122a18402c19d369a7c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 7 Nov 2025 02:12:21 -0800 Subject: [PATCH 06/26] Rename to column --- cpp/benchmarks/bench_comp_comm.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 43d2eef63..a8718b308 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -57,7 +57,7 @@ namespace { enum class PackMode { Table, - Columns + Column }; enum class Algo { Cascaded, @@ -376,12 +376,11 @@ struct ArgumentParser { std::string v{optarg}; if (v == "table") args_.pack_mode = PackMode::Table; - else if (v == "columns") - args_.pack_mode = PackMode::Columns; + else if (v == "column") + args_.pack_mode = PackMode::Column; else RAPIDSMPF_FAIL( - "-P must be one of {table, columns}", - std::invalid_argument + "-P must be one of {table, column}", std::invalid_argument ); break; } @@ -420,7 +419,7 @@ struct ArgumentParser { << " -m RMM MR {cuda, pool, async, managed} " "(default: pool)\n" << " -F Parquet file glob/pattern (required)\n" - << " -P Packing mode {table, columns} (default: " + << " -P Packing mode {table, column} (default: " "table)\n" << " -A {cascaded, lz4} (default: cascaded)\n" << " -K Algo params, e.g. " @@ -641,6 +640,8 @@ RunResult run_once( if (!send_buf->is_latest_write_done()) { send_buf->stream().synchronize(); } + // std::cout << "Sending payload of size " << nocomp_payloads[i]->size << + // " to rank " << dst << std::endl; send_futs.push_back(comm->send(std::move(send_buf), dst, tag_nocomp)); } } From d3720d5f7515996376184971c3d88c5ea8592162 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 06:37:55 -0800 Subject: [PATCH 07/26] Use roundtrip transfers --- cpp/benchmarks/bench_comp_comm.cpp | 312 ++++++++++++++--------------- 1 file changed, 148 insertions(+), 164 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index a8718b308..b28b2b5fb 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -97,7 +97,6 @@ std::vector expand_glob(std::string const& pattern) { } } globfree(&glob_result); - std::sort(files.begin(), files.end()); return files; } @@ -516,10 +515,9 @@ struct SizeHeader { struct Timings { double compress_s{0.0}; double decompress_s{0.0}; - double comp_send_s{0.0}; - double recv_decomp_s{0.0}; - double send_only_s{0.0}; - double recv_only_s{0.0}; + // Round-trip totals measured at initiator + double rt_nocomp_s{0.0}; + double rt_comp_s{0.0}; }; // Returns timings and bytes counters @@ -540,7 +538,8 @@ RunResult run_once( BufferResource* br, std::shared_ptr const& statistics, BuffersToSend const& data, - NvcompCodec& codec + NvcompCodec& codec, + std::uint64_t run_index ) { (void)statistics; auto const nranks = comm->nranks(); @@ -548,9 +547,10 @@ RunResult run_once( auto const dst = static_cast((rank + 1) % nranks); auto const src = static_cast((rank - 1 + nranks) % nranks); - Tag tag_size{1, 0}; - Tag tag_payload{1, 1}; - Tag tag_nocomp{2, 0}; + Tag tag_ping_nc{10, 0}; + Tag tag_pong_nc{10, 1}; + Tag tag_ping_c{11, 0}; + Tag tag_pong_c{11, 1}; // Clone packed items into raw device buffers for repeated ops std::vector> nocomp_payloads; @@ -614,138 +614,138 @@ RunResult run_once( RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = Clock::now(); - // Phase A: pure send/recv (no compression) - auto a0 = Clock::now(); - std::vector> send_futs; 
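// What patch 07 changes (hedged summary): the overlapped send/recv phases being
// removed here are replaced by ping-pong round trips, where the initiator
// timestamps before posting pings and after draining pongs, so each op costs
// one RTT and one-way throughput is later reported as 2 * bytes / RTT.
// Completion is driven by polling; the template arguments stripped from the
// surrounding lines are assumed to be std::unique_ptr<Communicator::Future>:
//
//   std::vector<std::unique_ptr<Communicator::Future>> futs = /* posted ops */;
//   while (!futs.empty()) {
//       std::ignore = comm->test_some(futs);  // removes completed futures
//   }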
- std::vector> recv_futs; - send_futs.reserve(args.num_ops * nocomp_payloads.size()); - recv_futs.reserve(args.num_ops * nocomp_payloads.size()); - + // Phase A (RTT no compression): ping-pong per op + Duration rt_nc_total{0}; for (std::uint64_t op = 0; op < args.num_ops; ++op) { - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - // post recv first - if (src != rank) { + bool initiator = + ((static_cast(rank) + op + run_index) % 2ull) == 0ull; + auto rt_start = Clock::now(); + if (initiator) { + // Initiator: post pong recvs, then ping sends + std::vector> pong_recvs; + std::vector> ping_sends; + pong_recvs.reserve(nocomp_payloads.size()); + ping_sends.reserve(nocomp_payloads.size()); + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { auto res = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); - recv_futs.push_back(comm->recv(src, tag_nocomp, std::move(recv_buf))); + pong_recvs.push_back(comm->recv(src, tag_pong_nc, std::move(recv_buf))); } - } - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - if (dst != rank) { + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { auto res = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); - if (!send_buf->is_latest_write_done()) { + if (!send_buf->is_latest_write_done()) send_buf->stream().synchronize(); - } - // std::cout << "Sending payload of size " << nocomp_payloads[i]->size << - // " to rank " << dst << std::endl; - send_futs.push_back(comm->send(std::move(send_buf), dst, tag_nocomp)); + ping_sends.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc)); } - } - } - // Drive send/recv completion concurrently and timestamp each independently - std::optional send_done_tp; - std::optional recv_done_tp; - while (!send_futs.empty() || !recv_futs.empty()) { - if (!send_futs.empty()) { - std::ignore = comm->test_some(send_futs); - if (!send_done_tp && send_futs.empty()) - send_done_tp = Clock::now(); - } - if (!recv_futs.empty()) { - std::ignore = comm->test_some(recv_futs); - if (!recv_done_tp && recv_futs.empty()) - recv_done_tp = Clock::now(); - } - } - auto a1 = send_done_tp.value_or(Clock::now()); - auto a2 = recv_done_tp.value_or(Clock::now()); - - // Phase B: compressed path (send size header, then compressed payload) - auto b0 = Clock::now(); - std::vector> send_hdr_futs; - std::vector> send_cmp_futs; - std::vector> recv_hdr_futs; - std::vector> recv_cmp_futs; - send_hdr_futs.reserve(args.num_ops * data.items.size()); - send_cmp_futs.reserve(args.num_ops * data.items.size()); - recv_hdr_futs.reserve(args.num_ops * data.items.size()); - recv_cmp_futs.reserve(args.num_ops * data.items.size()); - - for (std::uint64_t op = 0; op < args.num_ops; ++op) { - for (std::size_t i = 0; i < data.items.size(); ++i) { - // post recv header - if (src != rank) { - auto res_h = br->reserve_or_fail(sizeof(SizeHeader), MemoryType::HOST); - auto hdr = br->allocate(sizeof(SizeHeader), stream, res_h); - recv_hdr_futs.push_back(comm->recv(src, tag_size, std::move(hdr))); + while (!ping_sends.empty()) { + std::ignore = comm->test_some(ping_sends); } - } - for (std::size_t i = 0; i < data.items.size(); ++i) { - if (dst != rank) { - auto res_h = br->reserve_or_fail(sizeof(SizeHeader), MemoryType::HOST); - auto hdr = br->allocate(sizeof(SizeHeader), stream, res_h); - // write header - 
hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { - SizeHeader h{static_cast(comp_output_sizes[i])}; - std::memcpy(p, &h, sizeof(SizeHeader)); - }); - if (!hdr->is_latest_write_done()) { - hdr->stream().synchronize(); - } - send_hdr_futs.push_back(comm->send(std::move(hdr), dst, tag_size)); + while (!pong_recvs.empty()) { + std::ignore = comm->test_some(pong_recvs); + } + } else { + // Responder: post ping recvs, then pong sends + std::vector> ping_recvs; + std::vector> pong_sends; + ping_recvs.reserve(nocomp_payloads.size()); + pong_sends.reserve(nocomp_payloads.size()); + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + auto res = + br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); + auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + ping_recvs.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf))); + } + while (!ping_recvs.empty()) { + std::ignore = comm->test_some(ping_recvs); + } + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + auto res = + br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); + auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); + if (!send_buf->is_latest_write_done()) + send_buf->stream().synchronize(); + pong_sends.push_back(comm->send(std::move(send_buf), src, tag_pong_nc)); + } + while (!pong_sends.empty()) { + std::ignore = comm->test_some(pong_sends); } } - } - while (!send_hdr_futs.empty()) { - std::ignore = comm->test_some(send_hdr_futs); - } - while (!recv_hdr_futs.empty()) { - std::ignore = comm->test_some(recv_hdr_futs); + auto rt_end = Clock::now(); + // Each rank measures its own RTT locally + rt_nc_total += (rt_end - rt_start); } - // Post payload recvs now that we know sizes + // Phase B (RTT compressed payload only): ping-pong of compressed buffers (no headers) + Duration rt_c_total{0}; for (std::uint64_t op = 0; op < args.num_ops; ++op) { - for (std::size_t i = 0; i < data.items.size(); ++i) { - if (src != rank) { - // reuse comp_output_sizes[i] as expected size since peers are symmetric - if (comp_output_sizes[i] > 0) { - auto res = - br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto buf = br->allocate(comp_output_sizes[i], stream, res); - recv_cmp_futs.push_back(comm->recv(src, tag_payload, std::move(buf))); - } + bool initiator = + ((static_cast(rank) + op + run_index) % 2ull) == 0ull; + auto rt_start = Clock::now(); + if (initiator) { + std::vector> pong_recvs; + std::vector> ping_sends; + pong_recvs.reserve(data.items.size()); + ping_sends.reserve(data.items.size()); + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (comp_output_sizes[i] == 0) + continue; + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto recv_buf = br->allocate(comp_output_sizes[i], stream, res); + pong_recvs.push_back(comm->recv(src, tag_pong_c, std::move(recv_buf))); } - } - for (std::size_t i = 0; i < data.items.size(); ++i) { - if (dst != rank) { - // send compressed - if (comp_output_sizes[i] > 0) { - auto tmp_buf = - br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto send_buf = br->allocate(comp_output_sizes[i], stream, tmp_buf); - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - if (!send_buf->is_latest_write_done()) { - send_buf->stream().synchronize(); - } - send_cmp_futs.push_back( - comm->send(std::move(send_buf), dst, tag_payload) - ); - } + for (std::size_t i = 0; i < data.items.size(); 
++i) { + if (comp_output_sizes[i] == 0) + continue; + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto send_buf = br->allocate(comp_output_sizes[i], stream, res); + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + if (!send_buf->is_latest_write_done()) + send_buf->stream().synchronize(); + ping_sends.push_back(comm->send(std::move(send_buf), dst, tag_ping_c)); + } + while (!ping_sends.empty()) { + std::ignore = comm->test_some(ping_sends); + } + while (!pong_recvs.empty()) { + std::ignore = comm->test_some(pong_recvs); + } + } else { + std::vector> ping_recvs; + std::vector> pong_sends; + ping_recvs.reserve(data.items.size()); + pong_sends.reserve(data.items.size()); + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (comp_output_sizes[i] == 0) + continue; + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto recv_buf = br->allocate(comp_output_sizes[i], stream, res); + ping_recvs.push_back(comm->recv(src, tag_ping_c, std::move(recv_buf))); + } + while (!ping_recvs.empty()) { + std::ignore = comm->test_some(ping_recvs); + } + for (std::size_t i = 0; i < data.items.size(); ++i) { + if (comp_output_sizes[i] == 0) + continue; + auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); + auto send_buf = br->allocate(comp_output_sizes[i], stream, res); + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + if (!send_buf->is_latest_write_done()) + send_buf->stream().synchronize(); + pong_sends.push_back(comm->send(std::move(send_buf), src, tag_pong_c)); + } + while (!pong_sends.empty()) { + std::ignore = comm->test_some(pong_sends); } } + auto rt_end = Clock::now(); + rt_c_total += (rt_end - rt_start); } - while (!send_cmp_futs.empty()) { - std::ignore = comm->test_some(send_cmp_futs); - } - auto b1 = Clock::now(); - while (!recv_cmp_futs.empty()) { - std::ignore = comm->test_some(recv_cmp_futs); - } - auto b2 = Clock::now(); // Decompress received buffers (simulate by decompressing our own produced outputs in // symmetric setup) @@ -778,11 +778,8 @@ RunResult run_once( RunResult result{}; result.times.compress_s = std::chrono::duration(t1 - t0).count(); - result.times.send_only_s = std::chrono::duration(a1 - a0).count(); - result.times.recv_only_s = std::chrono::duration(a2 - a0).count(); - result.times.comp_send_s = std::chrono::duration(b1 - b0).count(); - result.times.recv_decomp_s = std::chrono::duration(b2 - b1).count() - + std::chrono::duration(c1 - c0).count(); + result.times.rt_nocomp_s = rt_nc_total.count(); + result.times.rt_comp_s = rt_c_total.count(); result.times.decompress_s = std::chrono::duration(c1 - c0).count(); // Use payload (device) bytes as the logical uncompressed size for throughput @@ -906,14 +903,11 @@ int main(int argc, char** argv) { auto codec = make_codec(args.algo, args.params); // Runs - std::vector compress_t, decompress_t, comp_send_t, recv_decomp_t, send_t, - recv_t; + std::vector compress_t, decompress_t, rt_nc_t, rt_c_t; compress_t.reserve(args.num_runs); decompress_t.reserve(args.num_runs); - comp_send_t.reserve(args.num_runs); - recv_decomp_t.reserve(args.num_runs); - send_t.reserve(args.num_runs); - recv_t.reserve(args.num_runs); + rt_nc_t.reserve(args.num_runs); + rt_c_t.reserve(args.num_runs); std::size_t logical_bytes = packed.total_uncompressed_bytes * args.num_ops; @@ -921,28 +915,29 @@ int main(int argc, char** argv) { if (i == args.num_warmups + args.num_runs - 1) { stats = std::make_shared(/* enable = */ true); } - auto rr = 
run_once(comm, args, stream, &br, stats, packed, *codec); + auto rr = run_once(comm, args, stream, &br, stats, packed, *codec, i); double cBps = static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.compress_s; double dBps = static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.decompress_s; - double csBps = static_cast(rr.counts.logical_uncompressed_bytes) - / rr.times.comp_send_s; - double rdBps = static_cast(rr.counts.logical_uncompressed_bytes) - / rr.times.recv_decomp_s; - double sBps = static_cast(rr.counts.logical_uncompressed_bytes) - / rr.times.send_only_s; - double rBps = static_cast(rr.counts.logical_uncompressed_bytes) - / rr.times.recv_only_s; + // Round-trip one-way throughput: 2 * bytes_one_way / RTT + double rt_nc_Bps = + rr.times.rt_nocomp_s > 0.0 + ? (2.0 * static_cast(rr.counts.logical_uncompressed_bytes)) + / rr.times.rt_nocomp_s + : 0.0; + double rt_c_Bps = + rr.times.rt_comp_s > 0.0 + ? (2.0 * static_cast(rr.counts.logical_uncompressed_bytes)) + / rr.times.rt_comp_s + : 0.0; std::stringstream ss; ss << "compress: " << format_nbytes(cBps) << "/s | decompress: " << format_nbytes(dBps) - << "/s | comp+send: " << format_nbytes(csBps) - << "/s | recv+decomp: " << format_nbytes(rdBps) - << "/s | send-only: " << format_nbytes(sBps) - << "/s | recv-only: " << format_nbytes(rBps) << "/s"; + << "/s | rt(nocomp): " << format_nbytes(rt_nc_Bps) + << "/s | rt(comp): " << format_nbytes(rt_c_Bps) << "/s"; if (i < args.num_warmups) ss << " (warmup run)"; log.print(ss.str()); @@ -954,18 +949,8 @@ int main(int argc, char** argv) { decompress_t.push_back( static_cast(rr.counts.logical_uncompressed_bytes) / dBps ); - comp_send_t.push_back( - static_cast(rr.counts.logical_uncompressed_bytes) / csBps - ); - recv_decomp_t.push_back( - static_cast(rr.counts.logical_uncompressed_bytes) / rdBps - ); - send_t.push_back( - static_cast(rr.counts.logical_uncompressed_bytes) / sBps - ); - recv_t.push_back( - static_cast(rr.counts.logical_uncompressed_bytes) / rBps - ); + rt_nc_t.push_back(rr.times.rt_nocomp_s); + rt_c_t.push_back(rr.times.rt_comp_s); } } @@ -980,18 +965,17 @@ int main(int argc, char** argv) { if (!compress_t.empty()) { double mean_elapsed_c = harmonic_mean(compress_t); double mean_elapsed_d = harmonic_mean(decompress_t); - double mean_elapsed_cs = harmonic_mean(comp_send_t); - double mean_elapsed_rd = harmonic_mean(recv_decomp_t); - double mean_elapsed_s = harmonic_mean(send_t); - double mean_elapsed_r = harmonic_mean(recv_t); + double mean_rt_nc = harmonic_mean(rt_nc_t); + double mean_rt_c = harmonic_mean(rt_c_t); std::stringstream ss; ss << "means: compress: " << format_nbytes(logical_bytes / mean_elapsed_c) << "/s" << " | decompress: " << format_nbytes(logical_bytes / mean_elapsed_d) << "/s" - << " | comp+send: " << format_nbytes(logical_bytes / mean_elapsed_cs) << "/s" - << " | recv+decomp: " << format_nbytes(logical_bytes / mean_elapsed_rd) << "/s" - << " | send-only: " << format_nbytes(logical_bytes / mean_elapsed_s) << "/s" - << " | recv-only: " << format_nbytes(logical_bytes / mean_elapsed_r) << "/s"; + << " | rt(nocomp): " + << format_nbytes((2.0 * static_cast(logical_bytes)) / mean_rt_nc) + << "/s | rt(comp): " + << format_nbytes((2.0 * static_cast(logical_bytes)) / mean_rt_c) + << "/s"; log.print(ss.str()); } From 99d3ba94ddcab0d32ea2369eeb50d19d3d98d038 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 08:07:04 -0800 Subject: [PATCH 08/26] Fix roundtrip on >2 ranks --- cpp/benchmarks/bench_comp_comm.cpp | 217 
+++++++++++++++++------------ 1 file changed, 127 insertions(+), 90 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index b28b2b5fb..6ff524ddc 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -614,65 +614,54 @@ RunResult run_once( RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = Clock::now(); - // Phase A (RTT no compression): ping-pong per op + // Phase A (RTT no compression): ping-pong per op (sequential per item to avoid + // deadlocks) Duration rt_nc_total{0}; for (std::uint64_t op = 0; op < args.num_ops; ++op) { bool initiator = ((static_cast(rank) + op + run_index) % 2ull) == 0ull; auto rt_start = Clock::now(); if (initiator) { - // Initiator: post pong recvs, then ping sends - std::vector> pong_recvs; - std::vector> ping_sends; - pong_recvs.reserve(nocomp_payloads.size()); - ping_sends.reserve(nocomp_payloads.size()); for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto res = + // post pong recv and send ping, then wait both + auto res_r = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); - pong_recvs.push_back(comm->recv(src, tag_pong_nc, std::move(recv_buf))); - } - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto res = + auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res_r); + std::vector> futs; + futs.push_back(comm->recv(dst, tag_pong_nc, std::move(recv_buf))); + auto res_s = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res_s); buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); if (!send_buf->is_latest_write_done()) send_buf->stream().synchronize(); - ping_sends.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc)); - } - while (!ping_sends.empty()) { - std::ignore = comm->test_some(ping_sends); - } - while (!pong_recvs.empty()) { - std::ignore = comm->test_some(pong_recvs); + futs.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc)); + while (!futs.empty()) { + std::ignore = comm->test_some(futs); + } } } else { - // Responder: post ping recvs, then pong sends - std::vector> ping_recvs; - std::vector> pong_sends; - ping_recvs.reserve(nocomp_payloads.size()); - pong_sends.reserve(nocomp_payloads.size()); + // Responder: for each item, recv ping then send pong for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto res = + auto res_r = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res); - ping_recvs.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf))); - } - while (!ping_recvs.empty()) { - std::ignore = comm->test_some(ping_recvs); - } - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto res = + auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res_r); + std::vector> rf; + rf.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf))); + while (!rf.empty()) { + std::ignore = comm->test_some(rf); + } + auto res_s = br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res); + auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res_s); buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); if (!send_buf->is_latest_write_done()) 
send_buf->stream().synchronize();
-                pong_sends.push_back(comm->send(std::move(send_buf), src, tag_pong_nc));
-            }
-            while (!pong_sends.empty()) {
-                std::ignore = comm->test_some(pong_sends);
+                std::vector<std::unique_ptr<Communicator::Future>> sf;
+                sf.push_back(comm->send(std::move(send_buf), src, tag_pong_nc));
+                while (!sf.empty()) {
+                    std::ignore = comm->test_some(sf);
+                }
             }
         }
         auto rt_end = Clock::now();
@@ -680,67 +669,115 @@ RunResult run_once(
         rt_nc_total += (rt_end - rt_start);
     }

-    // Phase B (RTT compressed payload only): ping-pong of compressed buffers (no headers)
+    // Phase B (RTT compressed payload only): ping-pong with size headers per item
     Duration rt_c_total{0};
     for (std::uint64_t op = 0; op < args.num_ops; ++op) {
         bool initiator =
             ((static_cast<std::uint64_t>(rank) + op + run_index) % 2ull) == 0ull;
         auto rt_start = Clock::now();
         if (initiator) {
-            std::vector<std::unique_ptr<Communicator::Future>> pong_recvs;
-            std::vector<std::unique_ptr<Communicator::Future>> ping_sends;
-            pong_recvs.reserve(data.items.size());
-            ping_sends.reserve(data.items.size());
             for (std::size_t i = 0; i < data.items.size(); ++i) {
-                if (comp_output_sizes[i] == 0)
-                    continue;
-                auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE);
-                auto recv_buf = br->allocate(comp_output_sizes[i], stream, res);
-                pong_recvs.push_back(comm->recv(src, tag_pong_c, std::move(recv_buf)));
-            }
-            for (std::size_t i = 0; i < data.items.size(); ++i) {
-                if (comp_output_sizes[i] == 0)
-                    continue;
-                auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE);
-                auto send_buf = br->allocate(comp_output_sizes[i], stream, res);
-                buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]);
-                if (!send_buf->is_latest_write_done())
-                    send_buf->stream().synchronize();
-                ping_sends.push_back(comm->send(std::move(send_buf), dst, tag_ping_c));
-            }
-            while (!ping_sends.empty()) {
-                std::ignore = comm->test_some(ping_sends);
-            }
-            while (!pong_recvs.empty()) {
-                std::ignore = comm->test_some(pong_recvs);
+                // Send header with size to dst
+                std::uint64_t sz = static_cast<std::uint64_t>(comp_output_sizes[i]);
+                auto res_h = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST);
+                auto hdr = br->allocate(sizeof(std::uint64_t), stream, res_h);
+                hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) {
+                    std::memcpy(p, &sz, sizeof(std::uint64_t));
+                });
+                if (!hdr->is_latest_write_done())
+                    hdr->stream().synchronize();
+                std::vector<std::unique_ptr<Communicator::Future>> hf;
+                hf.push_back(comm->send(std::move(hdr), dst, tag_ping_c));
+                while (!hf.empty()) {
+                    std::ignore = comm->test_some(hf);
+                }
+                // Receive pong header with size from dst (blocking wait)
+                auto res_hr =
+                    br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST);
+                auto hdr_r = br->allocate(sizeof(std::uint64_t), stream, res_hr);
+                auto fut_hdr = comm->recv(dst, tag_pong_c, std::move(hdr_r));
+                auto hdr_buf = comm->wait(std::move(fut_hdr));
+                auto* p = hdr_buf->exclusive_data_access();
+                std::uint64_t pong_sz = 0;
+                std::memcpy(&pong_sz, p, sizeof(std::uint64_t));
+                hdr_buf->unlock();
+                // Send ping payload (if any)
+                if (sz > 0) {
+                    auto res_s = br->reserve_or_fail(sz, MemoryType::DEVICE);
+                    auto send_buf = br->allocate(sz, stream, res_s);
+                    if (comp_output_sizes[i] > 0) {
+                        buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]);
+                    }
+                    if (!send_buf->is_latest_write_done())
+                        send_buf->stream().synchronize();
+                    std::vector<std::unique_ptr<Communicator::Future>> sf;
+                    sf.push_back(comm->send(std::move(send_buf), dst, tag_ping_c));
+                    while (!sf.empty()) {
+                        std::ignore = comm->test_some(sf);
+                    }
+                }
+                // Receive pong payload of announced size
+                if (pong_sz > 0) {
+                    auto res_r = br->reserve_or_fail(pong_sz, MemoryType::DEVICE);
+
auto recv_buf = br->allocate(pong_sz, stream, res_r); + std::vector> rf; + rf.push_back(comm->recv(dst, tag_pong_c, std::move(recv_buf))); + while (!rf.empty()) { + std::ignore = comm->test_some(rf); + } + } } } else { - std::vector> ping_recvs; - std::vector> pong_sends; - ping_recvs.reserve(data.items.size()); - pong_sends.reserve(data.items.size()); for (std::size_t i = 0; i < data.items.size(); ++i) { - if (comp_output_sizes[i] == 0) - continue; - auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto recv_buf = br->allocate(comp_output_sizes[i], stream, res); - ping_recvs.push_back(comm->recv(src, tag_ping_c, std::move(recv_buf))); - } - while (!ping_recvs.empty()) { - std::ignore = comm->test_some(ping_recvs); - } - for (std::size_t i = 0; i < data.items.size(); ++i) { - if (comp_output_sizes[i] == 0) - continue; - auto res = br->reserve_or_fail(comp_output_sizes[i], MemoryType::DEVICE); - auto send_buf = br->allocate(comp_output_sizes[i], stream, res); - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - if (!send_buf->is_latest_write_done()) - send_buf->stream().synchronize(); - pong_sends.push_back(comm->send(std::move(send_buf), src, tag_pong_c)); - } - while (!pong_sends.empty()) { - std::ignore = comm->test_some(pong_sends); + // Receive ping header with size (blocking wait) + auto res_hr = + br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto hdr_r = br->allocate(sizeof(std::uint64_t), stream, res_hr); + auto fut_hdr = comm->recv(src, tag_ping_c, std::move(hdr_r)); + auto hdr_buf = comm->wait(std::move(fut_hdr)); + auto* p = hdr_buf->exclusive_data_access(); + std::uint64_t ping_sz = 0; + std::memcpy(&ping_sz, p, sizeof(std::uint64_t)); + hdr_buf->unlock(); + // Send pong header with our size + std::uint64_t sz = static_cast(comp_output_sizes[i]); + auto res_h = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto hdr = br->allocate(sizeof(std::uint64_t), stream, res_h); + hdr->write_access([&](std::byte* q, rmm::cuda_stream_view) { + std::memcpy(q, &sz, sizeof(std::uint64_t)); + }); + if (!hdr->is_latest_write_done()) + hdr->stream().synchronize(); + std::vector> hf; + hf.push_back(comm->send(std::move(hdr), src, tag_pong_c)); + while (!hf.empty()) { + std::ignore = comm->test_some(hf); + } + // Receive ping payload + if (ping_sz > 0) { + auto res_r = br->reserve_or_fail(ping_sz, MemoryType::DEVICE); + auto recv_buf = br->allocate(ping_sz, stream, res_r); + std::vector> rf; + rf.push_back(comm->recv(src, tag_ping_c, std::move(recv_buf))); + while (!rf.empty()) { + std::ignore = comm->test_some(rf); + } + } + // Send pong payload + if (sz > 0) { + auto res_s = br->reserve_or_fail(sz, MemoryType::DEVICE); + auto send_buf = br->allocate(sz, stream, res_s); + if (comp_output_sizes[i] > 0) { + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); + } + if (!send_buf->is_latest_write_done()) + send_buf->stream().synchronize(); + std::vector> sf; + sf.push_back(comm->send(std::move(send_buf), src, tag_pong_c)); + while (!sf.empty()) { + std::ignore = comm->test_some(sf); + } + } } } auto rt_end = Clock::now(); From 99245f3bc42776520d61a0d73d0f22b2665fe8cf Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 10:17:23 -0800 Subject: [PATCH 09/26] Reduce code duplication --- cpp/benchmarks/bench_comp_comm.cpp | 203 +++++++++++++++-------------- 1 file changed, 107 insertions(+), 96 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp 
index 6ff524ddc..b4f90cd12 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -332,6 +332,93 @@ std::unique_ptr make_codec(Algo algo, KvParams const& p) { } } +static inline void ensure_ready(Buffer& buf) { + if (!buf.is_latest_write_done()) { + buf.stream().synchronize(); + } +} + +static inline std::unique_ptr alloc_device( + BufferResource* br, rmm::cuda_stream_view stream, std::size_t size +) { + auto res = br->reserve_or_fail(size, MemoryType::DEVICE); + return br->allocate(size, stream, res); +} + +static inline std::unique_ptr alloc_and_copy_device( + BufferResource* br, rmm::cuda_stream_view stream, Buffer const& src +) { + auto out = alloc_device(br, stream, src.size); + buffer_copy(*out, src, src.size); + return out; +} + +static inline void send_blocking( + std::shared_ptr const& comm, + std::unique_ptr buf, + Rank to, + Tag tag +) { + ensure_ready(*buf); + std::vector> futs; + futs.push_back(comm->send(std::move(buf), to, tag)); + while (!futs.empty()) { + std::ignore = comm->test_some(futs); + } +} + +static inline std::unique_ptr recv_blocking( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank from, + Tag tag, + std::size_t size +) { + auto buf = alloc_device(br, stream, size); + auto fut = comm->recv(from, tag, std::move(buf)); + return comm->wait(std::move(fut)); +} + +static inline void send_u64_header_blocking( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank to, + Tag tag, + std::uint64_t value +) { + auto res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto hdr = br->allocate(sizeof(std::uint64_t), stream, res); + hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { + std::memcpy(p, &value, sizeof(std::uint64_t)); + }); + ensure_ready(*hdr); + std::vector> futs; + futs.push_back(comm->send(std::move(hdr), to, tag)); + while (!futs.empty()) { + std::ignore = comm->test_some(futs); + } +} + +static inline std::uint64_t recv_u64_header_blocking( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank from, + Tag tag +) { + auto res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto hdr = br->allocate(sizeof(std::uint64_t), stream, res); + auto fut = comm->recv(from, tag, std::move(hdr)); + auto buf = comm->wait(std::move(fut)); + auto* p = buf->exclusive_data_access(); + std::uint64_t value = 0; + std::memcpy(&value, p, sizeof(std::uint64_t)); + buf->unlock(); + return value; +} + // Convenience: wrap metadata + gpu_data into rapidsmpf::PackedData static std::unique_ptr pack_table_to_packed( cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br @@ -624,17 +711,11 @@ RunResult run_once( if (initiator) { for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { // post pong recv and send ping, then wait both - auto res_r = - br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res_r); + auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size); std::vector> futs; futs.push_back(comm->recv(dst, tag_pong_nc, std::move(recv_buf))); - auto res_s = - br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE); - auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res_s); - buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size); - if (!send_buf->is_latest_write_done()) - send_buf->stream().synchronize(); + auto send_buf = 
alloc_and_copy_device(br, stream, *nocomp_payloads[i]);
+                ensure_ready(*send_buf);
                 futs.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc));
                 while (!futs.empty()) {
                     std::ignore = comm->test_some(futs);
@@ -643,20 +724,14 @@ RunResult run_once(
         } else {
             // Responder: for each item, recv ping then send pong
             for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) {
-                auto res_r =
-                    br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE);
-                auto recv_buf = br->allocate(nocomp_payloads[i]->size, stream, res_r);
+                auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size);
                 std::vector<std::unique_ptr<Communicator::Future>> rf;
                 rf.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf)));
                 while (!rf.empty()) {
                     std::ignore = comm->test_some(rf);
                 }
-                auto res_s =
-                    br->reserve_or_fail(nocomp_payloads[i]->size, MemoryType::DEVICE);
-                auto send_buf = br->allocate(nocomp_payloads[i]->size, stream, res_s);
-                buffer_copy(*send_buf, *nocomp_payloads[i], nocomp_payloads[i]->size);
-                if (!send_buf->is_latest_write_done())
-                    send_buf->stream().synchronize();
+                auto send_buf = alloc_and_copy_device(br, stream, *nocomp_payloads[i]);
+                ensure_ready(*send_buf);
                 std::vector<std::unique_ptr<Communicator::Future>> sf;
                 sf.push_back(comm->send(std::move(send_buf), src, tag_pong_nc));
                 while (!sf.empty()) {
@@ -679,104 +754,40 @@ RunResult run_once(
             for (std::size_t i = 0; i < data.items.size(); ++i) {
                 // Send header with size to dst
                 std::uint64_t sz = static_cast<std::uint64_t>(comp_output_sizes[i]);
-                auto res_h = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST);
-                auto hdr = br->allocate(sizeof(std::uint64_t), stream, res_h);
-                hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) {
-                    std::memcpy(p, &sz, sizeof(std::uint64_t));
-                });
-                if (!hdr->is_latest_write_done())
-                    hdr->stream().synchronize();
-                std::vector<std::unique_ptr<Communicator::Future>> hf;
-                hf.push_back(comm->send(std::move(hdr), dst, tag_ping_c));
-                while (!hf.empty()) {
-                    std::ignore = comm->test_some(hf);
-                }
+                send_u64_header_blocking(comm, br, stream, dst, tag_ping_c, sz);
                 // Receive pong header with size from dst (blocking wait)
-                auto res_hr =
-                    br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST);
-                auto hdr_r = br->allocate(sizeof(std::uint64_t), stream, res_hr);
-                auto fut_hdr = comm->recv(dst, tag_pong_c, std::move(hdr_r));
-                auto hdr_buf = comm->wait(std::move(fut_hdr));
-                auto* p = hdr_buf->exclusive_data_access();
-                std::uint64_t pong_sz = 0;
-                std::memcpy(&pong_sz, p, sizeof(std::uint64_t));
-                hdr_buf->unlock();
+                std::uint64_t pong_sz =
+                    recv_u64_header_blocking(comm, br, stream, dst, tag_pong_c);
                 // Send ping payload (if any)
                 if (sz > 0) {
-                    auto res_s = br->reserve_or_fail(sz, MemoryType::DEVICE);
-                    auto send_buf = br->allocate(sz, stream, res_s);
-                    if (comp_output_sizes[i] > 0) {
+                    auto send_buf = alloc_device(br, stream, sz);
+                    if (comp_output_sizes[i] > 0)
                         buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]);
-                    }
-                    if (!send_buf->is_latest_write_done())
-                        send_buf->stream().synchronize();
-                    std::vector<std::unique_ptr<Communicator::Future>> sf;
-                    sf.push_back(comm->send(std::move(send_buf), dst, tag_ping_c));
-                    while (!sf.empty()) {
-                        std::ignore = comm->test_some(sf);
-                    }
+                    send_blocking(comm, std::move(send_buf), dst, tag_ping_c);
                 }
                 // Receive pong payload of announced size
                 if (pong_sz > 0) {
-                    auto res_r = br->reserve_or_fail(pong_sz, MemoryType::DEVICE);
-                    auto recv_buf = br->allocate(pong_sz, stream, res_r);
-                    std::vector<std::unique_ptr<Communicator::Future>> rf;
-                    rf.push_back(comm->recv(dst, tag_pong_c, std::move(recv_buf)));
-                    while (!rf.empty()) {
-                        std::ignore = comm->test_some(rf);
-                    }
+                    (void)recv_blocking(comm, br, stream, dst, tag_pong_c, pong_sz);
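+                    // The (void) cast documents an intentional discard: this phase
+                    // only times the transfer, so the received payload is never
+                    // inspected; the decompression phase later in run_once() operates
+                    // on locally produced compressed outputs instead.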
} } } else { for (std::size_t i = 0; i < data.items.size(); ++i) { // Receive ping header with size (blocking wait) - auto res_hr = - br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto hdr_r = br->allocate(sizeof(std::uint64_t), stream, res_hr); - auto fut_hdr = comm->recv(src, tag_ping_c, std::move(hdr_r)); - auto hdr_buf = comm->wait(std::move(fut_hdr)); - auto* p = hdr_buf->exclusive_data_access(); - std::uint64_t ping_sz = 0; - std::memcpy(&ping_sz, p, sizeof(std::uint64_t)); - hdr_buf->unlock(); + std::uint64_t ping_sz = + recv_u64_header_blocking(comm, br, stream, src, tag_ping_c); // Send pong header with our size std::uint64_t sz = static_cast(comp_output_sizes[i]); - auto res_h = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto hdr = br->allocate(sizeof(std::uint64_t), stream, res_h); - hdr->write_access([&](std::byte* q, rmm::cuda_stream_view) { - std::memcpy(q, &sz, sizeof(std::uint64_t)); - }); - if (!hdr->is_latest_write_done()) - hdr->stream().synchronize(); - std::vector> hf; - hf.push_back(comm->send(std::move(hdr), src, tag_pong_c)); - while (!hf.empty()) { - std::ignore = comm->test_some(hf); - } + send_u64_header_blocking(comm, br, stream, src, tag_pong_c, sz); // Receive ping payload if (ping_sz > 0) { - auto res_r = br->reserve_or_fail(ping_sz, MemoryType::DEVICE); - auto recv_buf = br->allocate(ping_sz, stream, res_r); - std::vector> rf; - rf.push_back(comm->recv(src, tag_ping_c, std::move(recv_buf))); - while (!rf.empty()) { - std::ignore = comm->test_some(rf); - } + (void)recv_blocking(comm, br, stream, src, tag_ping_c, ping_sz); } // Send pong payload if (sz > 0) { - auto res_s = br->reserve_or_fail(sz, MemoryType::DEVICE); - auto send_buf = br->allocate(sz, stream, res_s); - if (comp_output_sizes[i] > 0) { + auto send_buf = alloc_device(br, stream, sz); + if (comp_output_sizes[i] > 0) buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - } - if (!send_buf->is_latest_write_done()) - send_buf->stream().synchronize(); - std::vector> sf; - sf.push_back(comm->send(std::move(send_buf), src, tag_pong_c)); - while (!sf.empty()) { - std::ignore = comm->test_some(sf); - } + send_blocking(comm, std::move(send_buf), src, tag_pong_c); } } } From 6c61c3637765080840b6cee22c1125380324a81e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 10:27:41 -0800 Subject: [PATCH 10/26] Make uncompressed transfer block identical --- cpp/benchmarks/bench_comp_comm.cpp | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index b4f90cd12..975ab2327 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -708,36 +708,23 @@ RunResult run_once( bool initiator = ((static_cast(rank) + op + run_index) % 2ull) == 0ull; auto rt_start = Clock::now(); - if (initiator) { + auto run_ping_pong_nc = [&](Rank peer, Tag recv_tag, Tag send_tag) { for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - // post pong recv and send ping, then wait both auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size); std::vector> futs; - futs.push_back(comm->recv(dst, tag_pong_nc, std::move(recv_buf))); + futs.push_back(comm->recv(peer, recv_tag, std::move(recv_buf))); auto send_buf = alloc_and_copy_device(br, stream, *nocomp_payloads[i]); ensure_ready(*send_buf); - futs.push_back(comm->send(std::move(send_buf), dst, tag_ping_nc)); + futs.push_back(comm->send(std::move(send_buf), peer, 
send_tag)); while (!futs.empty()) { std::ignore = comm->test_some(futs); } } + }; + if (initiator) { + run_ping_pong_nc(dst, tag_pong_nc, tag_ping_nc); } else { - // Responder: for each item, recv ping then send pong - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size); - std::vector> rf; - rf.push_back(comm->recv(src, tag_ping_nc, std::move(recv_buf))); - while (!rf.empty()) { - std::ignore = comm->test_some(rf); - } - auto send_buf = alloc_and_copy_device(br, stream, *nocomp_payloads[i]); - ensure_ready(*send_buf); - std::vector> sf; - sf.push_back(comm->send(std::move(send_buf), src, tag_pong_nc)); - while (!sf.empty()) { - std::ignore = comm->test_some(sf); - } - } + run_ping_pong_nc(src, tag_ping_nc, tag_pong_nc); } auto rt_end = Clock::now(); // Each rank measures its own RTT locally From 525541e528339d251a1cae2027d3e7dac5e8fe87 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 11:21:51 -0800 Subject: [PATCH 11/26] Implement functions to exchange header and payload --- cpp/benchmarks/bench_comp_comm.cpp | 201 +++++++++++++---------------- 1 file changed, 93 insertions(+), 108 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 975ab2327..b8684f25f 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -353,70 +353,68 @@ static inline std::unique_ptr alloc_and_copy_device( return out; } -static inline void send_blocking( - std::shared_ptr const& comm, - std::unique_ptr buf, - Rank to, - Tag tag -) { - ensure_ready(*buf); - std::vector> futs; - futs.push_back(comm->send(std::move(buf), to, tag)); - while (!futs.empty()) { - std::ignore = comm->test_some(futs); - } -} - -static inline std::unique_ptr recv_blocking( - std::shared_ptr const& comm, - BufferResource* br, - rmm::cuda_stream_view stream, - Rank from, - Tag tag, - std::size_t size -) { - auto buf = alloc_device(br, stream, size); - auto fut = comm->recv(from, tag, std::move(buf)); - return comm->wait(std::move(fut)); -} - -static inline void send_u64_header_blocking( +// Non-blocking helpers to exchange headers and payloads concurrently. 
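+// Both ranks call these helpers symmetrically: each posts its header send and its
+// header recv up front, reads the peer's announced size, and only then moves the
+// payloads, so neither rank has to assume the peer's buffer sizes match its own.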
+static inline std::uint64_t exchange_u64_header( std::shared_ptr const& comm, BufferResource* br, rmm::cuda_stream_view stream, - Rank to, - Tag tag, - std::uint64_t value + Rank peer, + Tag send_tag, + Tag recv_tag, + std::uint64_t send_value ) { - auto res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto hdr = br->allocate(sizeof(std::uint64_t), stream, res); - hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { - std::memcpy(p, &value, sizeof(std::uint64_t)); + // Post header send + auto send_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto send_hdr = br->allocate(sizeof(std::uint64_t), stream, send_hdr_res); + send_hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { + std::memcpy(p, &send_value, sizeof(std::uint64_t)); }); - ensure_ready(*hdr); - std::vector> futs; - futs.push_back(comm->send(std::move(hdr), to, tag)); - while (!futs.empty()) { - std::ignore = comm->test_some(futs); + ensure_ready(*send_hdr); + auto send_hdr_fut = comm->send(std::move(send_hdr), peer, send_tag); + // Post header recv + auto recv_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto recv_hdr = br->allocate(sizeof(std::uint64_t), stream, recv_hdr_res); + ensure_ready(*recv_hdr); + auto recv_hdr_fut = comm->recv(peer, recv_tag, std::move(recv_hdr)); + // Wait recv, read value, then ensure send completion + auto recv_hdr_buf = comm->wait(std::move(recv_hdr_fut)); + std::uint64_t recv_value = 0; + { + auto* p = recv_hdr_buf->exclusive_data_access(); + std::memcpy(&recv_value, p, sizeof(std::uint64_t)); + recv_hdr_buf->unlock(); } + std::ignore = comm->wait(std::move(send_hdr_fut)); + return recv_value; } -static inline std::uint64_t recv_u64_header_blocking( +static inline void exchange_payload( std::shared_ptr const& comm, BufferResource* br, rmm::cuda_stream_view stream, - Rank from, - Tag tag + Rank peer, + Tag send_tag, + Tag recv_tag, + std::unique_ptr send_buf, // may be null if no data to send + std::size_t recv_size // may be zero if no data to recv ) { - auto res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto hdr = br->allocate(sizeof(std::uint64_t), stream, res); - auto fut = comm->recv(from, tag, std::move(hdr)); - auto buf = comm->wait(std::move(fut)); - auto* p = buf->exclusive_data_access(); - std::uint64_t value = 0; - std::memcpy(&value, p, sizeof(std::uint64_t)); - buf->unlock(); - return value; + std::unique_ptr data_send_fut; + std::unique_ptr data_recv_fut; + if (recv_size > 0) { + auto recv_buf = alloc_device(br, stream, recv_size); + ensure_ready(*recv_buf); + data_recv_fut = comm->recv(peer, recv_tag, std::move(recv_buf)); + } + if (send_buf && send_buf->size > 0) { + ensure_ready(*send_buf); + data_send_fut = comm->send(std::move(send_buf), peer, send_tag); + } + if (data_recv_fut) { + (void)comm->wait(std::move(data_recv_fut)); + } + if (data_send_fut) { + std::ignore = comm->wait(std::move(data_send_fut)); + } } // Convenience: wrap metadata + gpu_data into rapidsmpf::PackedData @@ -708,23 +706,23 @@ RunResult run_once( bool initiator = ((static_cast(rank) + op + run_index) % 2ull) == 0ull; auto rt_start = Clock::now(); - auto run_ping_pong_nc = [&](Rank peer, Tag recv_tag, Tag send_tag) { - for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - auto recv_buf = alloc_device(br, stream, nocomp_payloads[i]->size); - std::vector> futs; - futs.push_back(comm->recv(peer, recv_tag, std::move(recv_buf))); - auto send_buf = alloc_and_copy_device(br, stream, 
*nocomp_payloads[i]); - ensure_ready(*send_buf); - futs.push_back(comm->send(std::move(send_buf), peer, send_tag)); - while (!futs.empty()) { - std::ignore = comm->test_some(futs); - } - } - }; - if (initiator) { - run_ping_pong_nc(dst, tag_pong_nc, tag_ping_nc); - } else { - run_ping_pong_nc(src, tag_ping_nc, tag_pong_nc); + Rank peer = initiator ? dst : src; + Tag send_tag_nc = initiator ? tag_ping_nc : tag_pong_nc; + Tag recv_tag_nc = initiator ? tag_pong_nc : tag_ping_nc; + for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { + std::size_t recv_size = nocomp_payloads[i]->size; + std::unique_ptr send_buf = + alloc_and_copy_device(br, stream, *nocomp_payloads[i]); + exchange_payload( + comm, + br, + stream, + peer, + send_tag_nc, + recv_tag_nc, + std::move(send_buf), + recv_size + ); } auto rt_end = Clock::now(); // Each rank measures its own RTT locally @@ -737,46 +735,33 @@ RunResult run_once( bool initiator = ((static_cast(rank) + op + run_index) % 2ull) == 0ull; auto rt_start = Clock::now(); - if (initiator) { - for (std::size_t i = 0; i < data.items.size(); ++i) { - // Send header with size to dst - std::uint64_t sz = static_cast(comp_output_sizes[i]); - send_u64_header_blocking(comm, br, stream, dst, tag_ping_c, sz); - // Receive pong header with size from src (blocking wait) - std::uint64_t pong_sz = - recv_u64_header_blocking(comm, br, stream, dst, tag_pong_c); - // Send ping payload (if any) - if (sz > 0) { - auto send_buf = alloc_device(br, stream, sz); - if (comp_output_sizes[i] > 0) - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - send_blocking(comm, std::move(send_buf), dst, tag_ping_c); - } - // Receive pong payload of announced size - if (pong_sz > 0) { - (void)recv_blocking(comm, br, stream, dst, tag_pong_c, pong_sz); - } - } - } else { - for (std::size_t i = 0; i < data.items.size(); ++i) { - // Receive ping header with size (blocking wait) - std::uint64_t ping_sz = - recv_u64_header_blocking(comm, br, stream, src, tag_ping_c); - // Send pong header with our size - std::uint64_t sz = static_cast(comp_output_sizes[i]); - send_u64_header_blocking(comm, br, stream, src, tag_pong_c, sz); - // Receive ping payload - if (ping_sz > 0) { - (void)recv_blocking(comm, br, stream, src, tag_ping_c, ping_sz); - } - // Send pong payload - if (sz > 0) { - auto send_buf = alloc_device(br, stream, sz); - if (comp_output_sizes[i] > 0) - buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); - send_blocking(comm, std::move(send_buf), src, tag_pong_c); - } + Rank peer = initiator ? dst : src; + Tag send_tag_c = initiator ? tag_ping_c : tag_pong_c; + Tag recv_tag_c = initiator ? 
tag_pong_c : tag_ping_c; + for (std::size_t i = 0; i < data.items.size(); ++i) { + // Header exchange: send our size, receive peer size + std::uint64_t local_sz = static_cast(comp_output_sizes[i]); + std::uint64_t remote_sz = exchange_u64_header( + comm, br, stream, peer, send_tag_c, recv_tag_c, local_sz + ); + // Prepare send buffer if needed + std::unique_ptr send_buf; + if (local_sz > 0) { + send_buf = alloc_device(br, stream, local_sz); + if (comp_output_sizes[i] > 0) + buffer_copy(*send_buf, *comp_outputs[i], comp_output_sizes[i]); } + // Payload exchange using the same tags + exchange_payload( + comm, + br, + stream, + peer, + send_tag_c, + recv_tag_c, + std::move(send_buf), + static_cast(remote_sz) + ); } auto rt_end = Clock::now(); rt_c_total += (rt_end - rt_start); From fbec2a24a750a48b1c3f74c916df717bbf396fc9 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 11:40:30 -0800 Subject: [PATCH 12/26] Print compression ratio --- cpp/benchmarks/bench_comp_comm.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index b8684f25f..ce48b2dfe 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -930,6 +931,7 @@ int main(int argc, char** argv) { rt_c_t.reserve(args.num_runs); std::size_t logical_bytes = packed.total_uncompressed_bytes * args.num_ops; + std::size_t logical_compressed_bytes_last = 0; for (std::uint64_t i = 0; i < args.num_warmups + args.num_runs; ++i) { if (i == args.num_warmups + args.num_runs - 1) { @@ -941,6 +943,11 @@ int main(int argc, char** argv) { / rr.times.compress_s; double dBps = static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.decompress_s; + logical_compressed_bytes_last = rr.counts.logical_compressed_bytes; + double ratio = (rr.counts.logical_compressed_bytes > 0) + ? 
static_cast(rr.counts.logical_uncompressed_bytes) + / static_cast(rr.counts.logical_compressed_bytes) + : 0.0; // Round-trip one-way throughput: 2 * bytes_one_way / RTT double rt_nc_Bps = rr.times.rt_nocomp_s > 0.0 @@ -957,7 +964,8 @@ int main(int argc, char** argv) { ss << "compress: " << format_nbytes(cBps) << "/s | decompress: " << format_nbytes(dBps) << "/s | rt(nocomp): " << format_nbytes(rt_nc_Bps) - << "/s | rt(comp): " << format_nbytes(rt_c_Bps) << "/s"; + << "/s | rt(comp): " << format_nbytes(rt_c_Bps) << "/s" + << " | comp ratio: " << std::fixed << std::setprecision(2) << ratio << "x"; if (i < args.num_warmups) ss << " (warmup run)"; log.print(ss.str()); @@ -996,6 +1004,14 @@ int main(int argc, char** argv) { << "/s | rt(comp): " << format_nbytes((2.0 * static_cast(logical_bytes)) / mean_rt_c) << "/s"; + if (logical_compressed_bytes_last > 0) { + double mean_ratio = static_cast(logical_bytes) + / static_cast(logical_compressed_bytes_last); + ss << " | comp ratio: " << std::fixed << std::setprecision(2) << mean_ratio + << "x"; + } else { + ss << " | comp ratio: n/a"; + } log.print(ss.str()); } From a1e79923fe29f6c376011f72cf0defdfad3a1463 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Nov 2025 11:56:59 -0800 Subject: [PATCH 13/26] Compressed time includes compression/decompression --- cpp/benchmarks/bench_comp_comm.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index ce48b2dfe..26ffce259 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -954,10 +954,12 @@ int main(int argc, char** argv) { ? (2.0 * static_cast(rr.counts.logical_uncompressed_bytes)) / rr.times.rt_nocomp_s : 0.0; + double const rt_comp_total_s = + rr.times.compress_s + rr.times.rt_comp_s + rr.times.decompress_s; double rt_c_Bps = - rr.times.rt_comp_s > 0.0 + rt_comp_total_s > 0.0 ? (2.0 * static_cast(rr.counts.logical_uncompressed_bytes)) - / rr.times.rt_comp_s + / rt_comp_total_s : 0.0; std::stringstream ss; @@ -978,7 +980,7 @@ int main(int argc, char** argv) { static_cast(rr.counts.logical_uncompressed_bytes) / dBps ); rt_nc_t.push_back(rr.times.rt_nocomp_s); - rt_c_t.push_back(rr.times.rt_comp_s); + rt_c_t.push_back(rt_comp_total_s); } } From b0951a147bfc746f8de672e87020216f5e963bd4 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 05:52:09 -0800 Subject: [PATCH 14/26] Fix assumed file sizes --- cpp/benchmarks/bench_comp_comm.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 26ffce259..c025765e6 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -711,9 +711,15 @@ RunResult run_once( Tag send_tag_nc = initiator ? tag_ping_nc : tag_pong_nc; Tag recv_tag_nc = initiator ? 
tag_pong_nc : tag_ping_nc; for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - std::size_t recv_size = nocomp_payloads[i]->size; - std::unique_ptr send_buf = - alloc_and_copy_device(br, stream, *nocomp_payloads[i]); + // Exchange payload sizes to avoid assuming symmetric sizes across ranks + std::uint64_t local_sz = static_cast(nocomp_payloads[i]->size); + std::uint64_t remote_sz = exchange_u64_header( + comm, br, stream, peer, send_tag_nc, recv_tag_nc, local_sz + ); + std::unique_ptr send_buf; + if (local_sz > 0) { + send_buf = alloc_and_copy_device(br, stream, *nocomp_payloads[i]); + } exchange_payload( comm, br, @@ -722,7 +728,7 @@ RunResult run_once( send_tag_nc, recv_tag_nc, std::move(send_buf), - recv_size + static_cast(remote_sz) ); } auto rt_end = Clock::now(); From ef01b3830671be637101a67f43ec6fb1b63d246a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 06:45:24 -0800 Subject: [PATCH 15/26] Move nvcomp code to librapidsmpf --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/bench_comp_comm.cpp | 179 +---------------------------- cpp/include/rapidsmpf/nvcomp.h | 109 ++++++++++++++++++ cpp/src/nvcomp.cpp | 161 ++++++++++++++++++++++++++ 4 files changed, 272 insertions(+), 178 deletions(-) create mode 100644 cpp/include/rapidsmpf/nvcomp.h create mode 100644 cpp/src/nvcomp.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c473bfd63..da1e92931 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -167,6 +167,7 @@ add_library( src/cuda_event.cpp src/integrations/cudf/partition.cpp src/integrations/cudf/utils.cpp + src/nvcomp.cpp src/pausable_thread_loop.cpp src/progress_thread.cpp src/rmm_resource_adaptor.cpp diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index c025765e6..c7ff27703 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -48,10 +49,6 @@ #include "utils/misc.hpp" #include "utils/rmm_stack.hpp" -// nvCOMP managers (v3.x API) -#include -#include - using namespace rapidsmpf; namespace { @@ -60,19 +57,6 @@ enum class PackMode { Table, Column }; -enum class Algo { - Cascaded, - LZ4 -}; - -struct KvParams { - // Common - std::size_t chunk_size{1 << 20}; - // Cascaded - int cascaded_rle{1}; - int cascaded_delta{1}; - int cascaded_bitpack{1}; -}; struct Args { std::string comm_type{"mpi"}; @@ -172,167 +156,6 @@ struct PhaseThroughputs { double recv_only_Bps{0.0}; }; -class NvcompCodec { - public: - virtual ~NvcompCodec() = default; - virtual std::size_t get_max_compressed_bytes( - std::size_t uncompressed_bytes, rmm::cuda_stream_view stream - ) = 0; - virtual void compress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t* out_bytes, - rmm::cuda_stream_view stream - ) = 0; - virtual void decompress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t out_bytes, - rmm::cuda_stream_view stream - ) = 0; -}; - -class LZ4Codec final : public NvcompCodec { - public: - explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} - - std::size_t get_max_compressed_bytes( - std::size_t in_bytes, rmm::cuda_stream_view stream - ) override { - nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; - nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; - nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; - auto cfg = mgr.configure_compression(in_bytes); - return 
cfg.max_compressed_buffer_size; - } - - void compress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t* out_bytes, - rmm::cuda_stream_view stream - ) override { - nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; - nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; - nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; - auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); - } - - void decompress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t out_bytes, - rmm::cuda_stream_view stream - ) override { - (void)out_bytes; - nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; - nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; - nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; - const uint8_t* in_ptrs[1] = {static_cast(d_in)}; - size_t in_sizes[1] = {in_bytes}; - auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); - uint8_t* out_ptrs[1] = {static_cast(d_out)}; - mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); - } - - private: - std::size_t chunk_size_; -}; - -class CascadedCodec final : public NvcompCodec { - public: - CascadedCodec(std::size_t chunk_size, int rle, int delta, int bitpack) - : chunk_size_{chunk_size} { - copts_ = nvcompBatchedCascadedCompressDefaultOpts; - copts_.num_RLEs = rle ? 1 : 0; - copts_.num_deltas = delta ? 1 : 0; - copts_.use_bp = bitpack ? 
1 : 0; - dopts_ = nvcompBatchedCascadedDecompressDefaultOpts; - } - - std::size_t get_max_compressed_bytes( - std::size_t in_bytes, rmm::cuda_stream_view stream - ) override { - nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; - auto cfg = mgr.configure_compression(in_bytes); - return cfg.max_compressed_buffer_size; - } - - void compress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t* out_bytes, - rmm::cuda_stream_view stream - ) override { - nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; - auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); - } - - void decompress( - void const* d_in, - std::size_t in_bytes, - void* d_out, - std::size_t out_bytes, - rmm::cuda_stream_view stream - ) override { - (void)out_bytes; - nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; - const uint8_t* in_ptrs[1] = {static_cast(d_in)}; - size_t in_sizes[1] = {in_bytes}; - auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); - uint8_t* out_ptrs[1] = {static_cast(d_out)}; - mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); - } - - private: - std::size_t chunk_size_{}; - nvcompBatchedCascadedCompressOpts_t copts_{}; - nvcompBatchedCascadedDecompressOpts_t dopts_{}; -}; - -std::unique_ptr make_codec(Algo algo, KvParams const& p) { - switch (algo) { - case Algo::LZ4: - return std::make_unique(p.chunk_size); - case Algo::Cascaded: - default: - return std::make_unique( - p.chunk_size, p.cascaded_rle, p.cascaded_delta, p.cascaded_bitpack - ); - } -} - static inline void ensure_ready(Buffer& buf) { if (!buf.is_latest_write_done()) { buf.stream().synchronize(); diff --git a/cpp/include/rapidsmpf/nvcomp.h b/cpp/include/rapidsmpf/nvcomp.h new file mode 100644 index 000000000..055fa41a2 --- /dev/null +++ b/cpp/include/rapidsmpf/nvcomp.h @@ -0,0 +1,109 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +#include + +namespace rapidsmpf { + +enum class Algo { + Cascaded, + LZ4 +}; + +/** + * @brief Parameters for nvCOMP codec configuration + * + * Holds configuration parameters for both generic and algorithm-specific compression + * settings. + */ +struct KvParams { + /// Chunk size for compression operations (default: 1 MiB) + std::size_t chunk_size{1 << 20}; + + /// Number of run-length encoding passes in Cascaded codec (must be non-negative, + /// default: 1) + int cascaded_rle{1}; + + /// Number of delta encoding passes in Cascaded codec (must be non-negative, default: + /// 1) + int cascaded_delta{1}; + + /// Enable bitpacking in Cascaded codec (default: enabled) + bool cascaded_bitpack{true}; +}; + +/** + * @brief Abstract base class for nvCOMP codec implementations + * + * Provides a unified interface for different compression algorithms (LZ4, Cascaded, etc.) + * to perform compression and decompression operations on GPU device memory. 
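+ *
+ * Typical usage (illustrative sketch; `d_in`, `d_out`, `d_back`, `in_bytes` and
+ * `stream` are assumed to be set up by the caller):
+ * @code
+ * auto codec = make_codec(Algo::LZ4, KvParams{});
+ * // Size the output buffer for the worst case before compressing.
+ * std::size_t max_out = codec->get_max_compressed_bytes(in_bytes, stream);
+ * std::size_t out_bytes = 0;  // actual compressed size, set by compress()
+ * codec->compress(d_in, in_bytes, d_out, &out_bytes, stream);
+ * // Round-trip: the caller must know the decompressed size (in_bytes here).
+ * codec->decompress(d_out, out_bytes, d_back, in_bytes, stream);
+ * @endcode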
+ */ +class NvcompCodec { + public: + virtual ~NvcompCodec() = default; + + /** + * @brief Calculate the maximum compressed size for the given input size + * + * @param uncompressed_bytes Size of the uncompressed data in bytes + * @param stream CUDA stream for operations + * @return Maximum possible compressed size in bytes + */ + virtual std::size_t get_max_compressed_bytes( + std::size_t uncompressed_bytes, rmm::cuda_stream_view stream + ) = 0; + + /** + * @brief Compress data on the GPU + * + * @param d_in Pointer to uncompressed data on device + * @param in_bytes Size of uncompressed data in bytes + * @param d_out Pointer to output buffer on device for compressed data + * @param out_bytes Pointer to output variable that will be set to actual compressed + * size + * @param stream CUDA stream for operations + */ + virtual void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) = 0; + + /** + * @brief Decompress data on the GPU + * + * @param d_in Pointer to compressed data on device + * @param in_bytes Size of compressed data in bytes + * @param d_out Pointer to output buffer on device for decompressed data + * @param out_bytes Expected size of decompressed data in bytes + * @param stream CUDA stream for operations + */ + virtual void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) = 0; +}; + +/** + * @brief Create an nvCOMP codec instance + * + * @param algo The compression algorithm to use + * @param p Parameters for the codec + * @return A unique pointer to an NvcompCodec instance + */ +std::unique_ptr make_codec(Algo algo, KvParams const& p); + +} // namespace rapidsmpf diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp new file mode 100644 index 000000000..612399c38 --- /dev/null +++ b/cpp/src/nvcomp.cpp @@ -0,0 +1,161 @@ +/** + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include +#include + +#include + +#include + +#include +#include + +namespace rapidsmpf { + +class LZ4Codec final : public NvcompCodec { + public: + explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {} + + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + (void)out_bytes; + nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; + nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; + nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); + } + + private: + std::size_t chunk_size_; +}; + +class CascadedCodec final : public NvcompCodec { + public: + CascadedCodec(std::size_t chunk_size, int rle, int delta, bool bitpack) + : chunk_size_{chunk_size} { + copts_ = nvcompBatchedCascadedCompressDefaultOpts; + copts_.num_RLEs = rle; + copts_.num_deltas = delta; + copts_.use_bp = bitpack ? 
1 : 0; + dopts_ = nvcompBatchedCascadedDecompressDefaultOpts; + } + + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + (void)out_bytes; + nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); + } + + private: + std::size_t chunk_size_{}; + nvcompBatchedCascadedCompressOpts_t copts_{}; + nvcompBatchedCascadedDecompressOpts_t dopts_{}; +}; + +std::unique_ptr make_codec(Algo algo, KvParams const& p) { + switch (algo) { + case Algo::LZ4: + return std::make_unique(p.chunk_size); + case Algo::Cascaded: + default: + return std::make_unique( + p.chunk_size, p.cascaded_rle, p.cascaded_delta, p.cascaded_bitpack + ); + } +} + +} // namespace rapidsmpf From 179886d5d9031acd0299b4339c6cc76556f8003a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 09:16:20 -0800 Subject: [PATCH 16/26] Ensure packed data is immediately completed --- cpp/benchmarks/bench_comp_comm.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index c7ff27703..76b98b9d8 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -246,9 +246,11 @@ static std::unique_ptr pack_table_to_packed( cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br ) { auto packed = cudf::pack(tv, stream, br->device_mr()); - return std::make_unique( + auto ret = std::make_unique( std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) ); + ret->data->stream().synchronize(); + return ret; } struct ArgumentParser { @@ -496,7 +498,7 @@ RunResult run_once( continue; } // Ensure any prior writes to input are completed - data.items[i].packed->data->stream().synchronize(); + // data.items[i].packed->data->stream().synchronize(); // Launch compression on the output buffer's stream and record an event after comp_outputs[i]->write_access( [&codec, &data, i, in_bytes, &comp_output_sizes, stream]( From 1e235b6b952ef364b6e784e2e3c15722ace1d033 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 09:46:21 -0800 Subject: [PATCH 17/26] Multi-stream compression --- cpp/benchmarks/bench_comp_comm.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff 
--git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 76b98b9d8..20b7936a5 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -491,38 +491,42 @@ RunResult run_once( RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = Clock::now(); // Compress all items (single batch) on stream + std::vector comp_streams(data.items.size()); for (std::size_t i = 0; i < data.items.size(); ++i) { auto const in_bytes = data.items[i].packed->data->size; if (in_bytes == 0) { comp_output_sizes[i] = 0; continue; } - // Ensure any prior writes to input are completed - // data.items[i].packed->data->stream().synchronize(); // Launch compression on the output buffer's stream and record an event after comp_outputs[i]->write_access( - [&codec, &data, i, in_bytes, &comp_output_sizes, stream]( + [&codec, &data, i, in_bytes, &comp_output_sizes, &comp_streams]( std::byte* out_ptr, rmm::cuda_stream_view out_stream ) { (void)out_ptr; // pointer used below // Lock input for raw pointer access auto* in_raw = data.items[i].packed->data->exclusive_data_access(); - std::size_t out_bytes = 0; codec.compress( static_cast(in_raw), in_bytes, static_cast(out_ptr), - &out_bytes, + &comp_output_sizes[i], out_stream ); - // Ensure comp_bytes is populated before returning - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(out_stream.value())); - data.items[i].packed->data->unlock(); - comp_output_sizes[i] = out_bytes; + // Defer synchronization and unlock; record stream for later sync + comp_streams[i] = out_stream; } ); } - RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); + // Synchronize streams and unlock inputs + for (std::size_t i = 0; i < data.items.size(); ++i) { + auto const in_bytes = data.items[i].packed->data->size; + if (in_bytes == 0) { + continue; + } + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(comp_streams[i].value())); + data.items[i].packed->data->unlock(); + } auto t1 = Clock::now(); // Phase A (RTT no compression): ping-pong per op (sequential per item to avoid From 4a79b842cc4b34ee6a34befb6448e10da7a49da6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 09:49:05 -0800 Subject: [PATCH 18/26] Multi-stream decompression --- cpp/benchmarks/bench_comp_comm.cpp | 42 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 20b7936a5..4ac59066c 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -606,6 +606,7 @@ RunResult run_once( // Decompress received buffers (simulate by decompressing our own produced outputs in // symmetric setup) auto c0 = Clock::now(); + std::vector decomp_streams(data.items.size()); for (std::size_t i = 0; i < data.items.size(); ++i) { auto const out_bytes = data.items[i].packed->data->size; if (out_bytes == 0) { @@ -613,23 +614,32 @@ RunResult run_once( } auto res = br->reserve_or_fail(out_bytes, MemoryType::DEVICE); auto out = br->allocate(out_bytes, stream, res); - // Ensure compressed outputs are ready before using as input - comp_outputs[i]->stream().synchronize(); - out->write_access([&codec, &comp_outputs, &comp_output_sizes, i, out_bytes]( - std::byte* out_ptr, rmm::cuda_stream_view out_stream - ) { - auto* in_raw = comp_outputs[i]->exclusive_data_access(); - codec.decompress( - static_cast(in_raw), - comp_output_sizes[i], - static_cast(out_ptr), - out_bytes, - out_stream - ); - comp_outputs[i]->unlock(); - }); + out->write_access( + [&codec, 
&comp_outputs, &comp_output_sizes, &decomp_streams, i, out_bytes]( + std::byte* out_ptr, rmm::cuda_stream_view out_stream + ) { + auto* in_raw = comp_outputs[i]->exclusive_data_access(); + codec.decompress( + static_cast(in_raw), + comp_output_sizes[i], + static_cast(out_ptr), + out_bytes, + out_stream + ); + // Defer unlock until after per-stream synchronization + decomp_streams[i] = out_stream; + } + ); + } + // Synchronize each decomp stream and then unlock the corresponding input + for (std::size_t i = 0; i < data.items.size(); ++i) { + auto const out_bytes = data.items[i].packed->data->size; + if (out_bytes == 0) { + continue; + } + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(decomp_streams[i].value())); + comp_outputs[i]->unlock(); } - RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto c1 = Clock::now(); RunResult result{}; From c3d6c792383268839da33e58991b5a6d95570cb7 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 11:11:28 -0800 Subject: [PATCH 19/26] Add Zstd and Snappy compression --- cpp/benchmarks/bench_comp_comm.cpp | 10 ++- cpp/include/rapidsmpf/nvcomp.h | 4 +- cpp/src/nvcomp.cpp | 133 +++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 4ac59066c..8032b509c 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -301,9 +301,14 @@ struct ArgumentParser { args_.algo = Algo::Cascaded; else if (v == "lz4") args_.algo = Algo::LZ4; + else if (v == "zstd") + args_.algo = Algo::Zstd; + else if (v == "snappy") + args_.algo = Algo::Snappy; else RAPIDSMPF_FAIL( - "-A must be one of {cascaded, lz4}", std::invalid_argument + "-A must be one of {cascaded, lz4, zstd, snappy}", + std::invalid_argument ); break; } @@ -331,7 +336,8 @@ struct ArgumentParser { << " -F Parquet file glob/pattern (required)\n" << " -P Packing mode {table, column} (default: " "table)\n" - << " -A {cascaded, lz4} (default: cascaded)\n" + << " -A {cascaded, lz4, zstd, snappy} (default: " + "cascaded)\n" << " -K Algo params, e.g. 
" "chunk_size=1MiB,delta=1,rle=1,bitpack=1\n" << " -p Number of concurrent ops (default: 1)\n" diff --git a/cpp/include/rapidsmpf/nvcomp.h b/cpp/include/rapidsmpf/nvcomp.h index 055fa41a2..ef4d367fd 100644 --- a/cpp/include/rapidsmpf/nvcomp.h +++ b/cpp/include/rapidsmpf/nvcomp.h @@ -15,7 +15,9 @@ namespace rapidsmpf { enum class Algo { Cascaded, - LZ4 + LZ4, + Zstd, + Snappy }; /** diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp index 612399c38..280135235 100644 --- a/cpp/src/nvcomp.cpp +++ b/cpp/src/nvcomp.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rapidsmpf { @@ -146,10 +148,141 @@ class CascadedCodec final : public NvcompCodec { nvcompBatchedCascadedDecompressOpts_t dopts_{}; }; +class SnappyCodec final : public NvcompCodec { + public: + explicit SnappyCodec(std::size_t chunk_size) : chunk_size_{chunk_size} {} + + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcompBatchedSnappyCompressOpts_t copts = nvcompBatchedSnappyCompressDefaultOpts; + nvcompBatchedSnappyDecompressOpts_t dopts = + nvcompBatchedSnappyDecompressDefaultOpts; + nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + ) override { + nvcompBatchedSnappyCompressOpts_t copts = nvcompBatchedSnappyCompressDefaultOpts; + nvcompBatchedSnappyDecompressOpts_t dopts = + nvcompBatchedSnappyDecompressDefaultOpts; + nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + (void)out_bytes; + nvcompBatchedSnappyCompressOpts_t copts = nvcompBatchedSnappyCompressDefaultOpts; + nvcompBatchedSnappyDecompressOpts_t dopts = + nvcompBatchedSnappyDecompressDefaultOpts; + nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); + } + + private: + std::size_t chunk_size_; +}; + +class ZstdCodec final : public NvcompCodec { + public: + explicit ZstdCodec(std::size_t chunk_size) : chunk_size_{chunk_size} {} + + std::size_t get_max_compressed_bytes( + std::size_t in_bytes, rmm::cuda_stream_view stream + ) override { + nvcompBatchedZstdCompressOpts_t copts = nvcompBatchedZstdCompressDefaultOpts; + nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; + nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + return cfg.max_compressed_buffer_size; + } + + void compress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t* out_bytes, + rmm::cuda_stream_view stream + 
) override { + nvcompBatchedZstdCompressOpts_t copts = nvcompBatchedZstdCompressDefaultOpts; + nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; + nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; + auto cfg = mgr.configure_compression(in_bytes); + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } + + void decompress( + void const* d_in, + std::size_t in_bytes, + void* d_out, + std::size_t out_bytes, + rmm::cuda_stream_view stream + ) override { + (void)out_bytes; + nvcompBatchedZstdCompressOpts_t copts = nvcompBatchedZstdCompressDefaultOpts; + nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; + nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; + const uint8_t* in_ptrs[1] = {static_cast(d_in)}; + size_t in_sizes[1] = {in_bytes}; + auto cfgs = mgr.configure_decompression(in_ptrs, 1, in_sizes); + uint8_t* out_ptrs[1] = {static_cast(d_out)}; + mgr.decompress(out_ptrs, in_ptrs, cfgs, nullptr); + } + + private: + std::size_t chunk_size_; +}; + std::unique_ptr make_codec(Algo algo, KvParams const& p) { switch (algo) { case Algo::LZ4: return std::make_unique(p.chunk_size); + case Algo::Zstd: + return std::make_unique(p.chunk_size); + case Algo::Snappy: + return std::make_unique(p.chunk_size); case Algo::Cascaded: default: return std::make_unique( From a671b4724154538694eb277cce9d3ae329893fb8 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Nov 2025 13:56:43 -0800 Subject: [PATCH 20/26] Allow passing BufferResource to compress() --- cpp/benchmarks/bench_comp_comm.cpp | 5 +- cpp/include/rapidsmpf/nvcomp.h | 7 +- cpp/src/nvcomp.cpp | 193 ++++++++++++++++++++--------- 3 files changed, 146 insertions(+), 59 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 8032b509c..f2cccb316 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -506,7 +506,7 @@ RunResult run_once( } // Launch compression on the output buffer's stream and record an event after comp_outputs[i]->write_access( - [&codec, &data, i, in_bytes, &comp_output_sizes, &comp_streams]( + [&codec, &data, i, in_bytes, &comp_output_sizes, &comp_streams, br]( std::byte* out_ptr, rmm::cuda_stream_view out_stream ) { (void)out_ptr; // pointer used below @@ -517,7 +517,8 @@ RunResult run_once( in_bytes, static_cast(out_ptr), &comp_output_sizes[i], - out_stream + out_stream, + br ); // Defer synchronization and unlock; record stream for later sync comp_streams[i] = out_stream; diff --git a/cpp/include/rapidsmpf/nvcomp.h b/cpp/include/rapidsmpf/nvcomp.h index ef4d367fd..5b4038348 100644 --- a/cpp/include/rapidsmpf/nvcomp.h +++ b/cpp/include/rapidsmpf/nvcomp.h @@ -13,6 +13,8 @@ namespace rapidsmpf { +class BufferResource; // forward declaration + enum class Algo { Cascaded, LZ4, @@ -72,13 +74,16 @@ class NvcompCodec { * @param out_bytes Pointer to output variable that will be set to actual compressed * size * @param stream CUDA stream for operations + * @param br Optional buffer resource used for temporary allocations (e.g., to capture + * compressed size on device and copy back to host). 
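+ *
+ * A hypothetical usage sketch (assumes d_out was sized with
+ * get_max_compressed_bytes() and br is a valid BufferResource):
+ * @code
+ * std::size_t out_bytes = 0;
+ * codec->compress(d_in, in_bytes, d_out, &out_bytes, stream, &br);
+ * @endcode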
*/ virtual void compress( void const* d_in, std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br = nullptr ) = 0; /** diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp index 280135235..5091b69d0 100644 --- a/cpp/src/nvcomp.cpp +++ b/cpp/src/nvcomp.cpp @@ -12,6 +12,7 @@ #include +#include #include #include @@ -40,25 +41,45 @@ class LZ4Codec final : public NvcompCodec { std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br ) override { nvcompBatchedLZ4CompressOpts_t copts = nvcompBatchedLZ4CompressDefaultOpts; nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts; nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + if (br != nullptr) { + auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); + auto size_buf = br->allocate(sizeof(size_t), stream, reservation); + size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + reinterpret_cast(sz_ptr) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s + )); + }); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + } else { + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), + sizeof(size_t), + cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } } void decompress( @@ -107,23 +128,43 @@ class CascadedCodec final : public NvcompCodec { std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br ) override { nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + if (br != nullptr) { + auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); + auto size_buf = br->allocate(sizeof(size_t), stream, reservation); + size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + reinterpret_cast(sz_ptr) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s + )); + }); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + } else { + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), + 
sizeof(size_t), + cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } } void decompress( @@ -168,26 +209,46 @@ class SnappyCodec final : public NvcompCodec { std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br ) override { nvcompBatchedSnappyCompressOpts_t copts = nvcompBatchedSnappyCompressDefaultOpts; nvcompBatchedSnappyDecompressOpts_t dopts = nvcompBatchedSnappyDecompressDefaultOpts; nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + if (br != nullptr) { + auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); + auto size_buf = br->allocate(sizeof(size_t), stream, reservation); + size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + reinterpret_cast(sz_ptr) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s + )); + }); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + } else { + size_t* pinned_bytes = nullptr; + RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), + sizeof(size_t), + cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } } void decompress( @@ -232,25 +293,45 @@ class ZstdCodec final : public NvcompCodec { std::size_t in_bytes, void* d_out, std::size_t* out_bytes, - rmm::cuda_stream_view stream + rmm::cuda_stream_view stream, + BufferResource* br ) override { nvcompBatchedZstdCompressOpts_t copts = nvcompBatchedZstdCompressDefaultOpts; nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), sizeof(size_t), cudaHostAllocDefault - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - pinned_bytes - ); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + if (br != nullptr) { + auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); + auto size_buf = br->allocate(sizeof(size_t), stream, reservation); + size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + reinterpret_cast(sz_ptr) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s + )); + }); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + } else { + size_t* pinned_bytes = nullptr; + 
RAPIDSMPF_CUDA_TRY(cudaHostAlloc( + reinterpret_cast(&pinned_bytes), + sizeof(size_t), + cudaHostAllocDefault + )); + mgr.compress( + static_cast(d_in), + static_cast(d_out), + cfg, + pinned_bytes + ); + RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); + *out_bytes = *pinned_bytes; + RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + } } void decompress( From 4cc54e4a8e3026b44870989b421229f4b2ad6d40 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Nov 2025 01:40:45 -0800 Subject: [PATCH 21/26] Use cudaMallocAsync when BufferResource is not specified --- cpp/src/nvcomp.cpp | 64 +++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp index 5091b69d0..e39001797 100644 --- a/cpp/src/nvcomp.cpp +++ b/cpp/src/nvcomp.cpp @@ -64,21 +64,21 @@ class LZ4Codec final : public NvcompCodec { }); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); } else { - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), - sizeof(size_t), - cudaHostAllocDefault + size_t* d_size = nullptr; + RAPIDSMPF_CUDA_TRY(cudaMallocAsync( + reinterpret_cast(&d_size), sizeof(size_t), stream.value() )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - pinned_bytes + d_size ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() + )); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); } } @@ -149,21 +149,21 @@ class CascadedCodec final : public NvcompCodec { }); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); } else { - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), - sizeof(size_t), - cudaHostAllocDefault + size_t* d_size = nullptr; + RAPIDSMPF_CUDA_TRY(cudaMallocAsync( + reinterpret_cast(&d_size), sizeof(size_t), stream.value() )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - pinned_bytes + d_size ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() + )); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); } } @@ -233,21 +233,21 @@ class SnappyCodec final : public NvcompCodec { }); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); } else { - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - reinterpret_cast(&pinned_bytes), - sizeof(size_t), - cudaHostAllocDefault + size_t* d_size = nullptr; + RAPIDSMPF_CUDA_TRY(cudaMallocAsync( + reinterpret_cast(&d_size), sizeof(size_t), stream.value() )); mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - pinned_bytes + d_size ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() + )); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - *out_bytes = *pinned_bytes; - RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes)); + RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); } } @@ -316,21 +316,21 @@ class ZstdCodec final : public NvcompCodec { }); RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); } else { - size_t* pinned_bytes = nullptr; - RAPIDSMPF_CUDA_TRY(cudaHostAlloc( - 
reinterpret_cast<void**>(&pinned_bytes),
-                sizeof(size_t),
-                cudaHostAllocDefault
+            size_t* d_size = nullptr;
+            RAPIDSMPF_CUDA_TRY(cudaMallocAsync(
+                reinterpret_cast<void**>(&d_size), sizeof(size_t), stream.value()
             ));
             mgr.compress(
                 static_cast<uint8_t const*>(d_in),
                 static_cast<uint8_t*>(d_out),
                 cfg,
-                pinned_bytes
+                d_size
             );
+            RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
+                out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value()
+            ));
             RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
-            *out_bytes = *pinned_bytes;
-            RAPIDSMPF_CUDA_TRY(cudaFreeHost(pinned_bytes));
+            RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value()));
         }
     }

From 5adcea74efd48e60a6ff6818baadadad07912cfc Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 12 Nov 2025 01:44:22 -0800
Subject: [PATCH 22/26] Use a helper allocation function

---
 cpp/src/nvcomp.cpp | 173 +++++++++++++++++----------------------------
 1 file changed, 65 insertions(+), 108 deletions(-)

diff --git a/cpp/src/nvcomp.cpp b/cpp/src/nvcomp.cpp
index e39001797..9abad0310 100644
--- a/cpp/src/nvcomp.cpp
+++ b/cpp/src/nvcomp.cpp
@@ -22,6 +22,39 @@
 namespace rapidsmpf {
 
+namespace {
+template <typename InvokeCompressFn>
+void invoke_compress_with_device_size_buffer(
+    InvokeCompressFn&& invoke_compress,
+    std::size_t* out_bytes,
+    rmm::cuda_stream_view stream,
+    BufferResource* br
+) {
+    if (br != nullptr) {
+        auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE);
+        auto size_buf = br->allocate(sizeof(size_t), stream, reservation);
+        size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) {
+            invoke_compress(reinterpret_cast<size_t*>(sz_ptr));
+            RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
+                out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s
+            ));
+        });
+        RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
+    } else {
+        size_t* d_size = nullptr;
+        RAPIDSMPF_CUDA_TRY(cudaMallocAsync(
+            reinterpret_cast<void**>(&d_size), sizeof(size_t), stream.value()
+        ));
+        invoke_compress(d_size);
+        RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
+            out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value()
+        ));
+        RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
+        RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value()));
+    }
+}
+}  // namespace
+
 class LZ4Codec final : public NvcompCodec {
   public:
     explicit LZ4Codec(std::size_t chunk_size) : chunk_size_{chunk_size} {}
@@ -48,38 +81,19 @@ class LZ4Codec final : public NvcompCodec {
         nvcompBatchedLZ4DecompressOpts_t dopts = nvcompBatchedLZ4DecompressDefaultOpts;
         nvcomp::LZ4Manager mgr{chunk_size_, copts, dopts, stream.value()};
         auto cfg = mgr.configure_compression(in_bytes);
-        if (br != nullptr) {
-            auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE);
-            auto size_buf = br->allocate(sizeof(size_t), stream, reservation);
-            size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) {
+        invoke_compress_with_device_size_buffer(
+            [&](size_t* sz_ptr) {
                 mgr.compress(
                     static_cast<uint8_t const*>(d_in),
                     static_cast<uint8_t*>(d_out),
                     cfg,
-                    reinterpret_cast<size_t*>(sz_ptr)
+                    sz_ptr
                 );
-                RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
-                    out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s
-                ));
-            });
-            RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value()));
-        } else {
-            size_t* d_size = nullptr;
-            RAPIDSMPF_CUDA_TRY(cudaMallocAsync(
-                reinterpret_cast<void**>(&d_size), sizeof(size_t), stream.value()
-            ));
-            mgr.compress(
-                static_cast<uint8_t const*>(d_in),
-                static_cast<uint8_t*>(d_out),
-                cfg,
-                d_size
-            );
-            RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync(
-                out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value()
-            ));
-
RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); - } + }, + out_bytes, + stream, + br + ); } void decompress( @@ -133,38 +147,19 @@ class CascadedCodec final : public NvcompCodec { ) override { nvcomp::CascadedManager mgr{chunk_size_, copts_, dopts_, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - if (br != nullptr) { - auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); - auto size_buf = br->allocate(sizeof(size_t), stream, reservation); - size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + invoke_compress_with_device_size_buffer( + [&](size_t* sz_ptr) { mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - reinterpret_cast(sz_ptr) + sz_ptr ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s - )); - }); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - } else { - size_t* d_size = nullptr; - RAPIDSMPF_CUDA_TRY(cudaMallocAsync( - reinterpret_cast(&d_size), sizeof(size_t), stream.value() - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - d_size - ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() - )); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); - } + }, + out_bytes, + stream, + br + ); } void decompress( @@ -217,38 +212,19 @@ class SnappyCodec final : public NvcompCodec { nvcompBatchedSnappyDecompressDefaultOpts; nvcomp::SnappyManager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - if (br != nullptr) { - auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); - auto size_buf = br->allocate(sizeof(size_t), stream, reservation); - size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + invoke_compress_with_device_size_buffer( + [&](size_t* sz_ptr) { mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - reinterpret_cast(sz_ptr) + sz_ptr ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s - )); - }); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - } else { - size_t* d_size = nullptr; - RAPIDSMPF_CUDA_TRY(cudaMallocAsync( - reinterpret_cast(&d_size), sizeof(size_t), stream.value() - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - d_size - ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() - )); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); - } + }, + out_bytes, + stream, + br + ); } void decompress( @@ -300,38 +276,19 @@ class ZstdCodec final : public NvcompCodec { nvcompBatchedZstdDecompressOpts_t dopts = nvcompBatchedZstdDecompressDefaultOpts; nvcomp::ZstdManager mgr{chunk_size_, copts, dopts, stream.value()}; auto cfg = mgr.configure_compression(in_bytes); - if (br != nullptr) { - auto reservation = br->reserve_or_fail(sizeof(size_t), MemoryType::DEVICE); - auto size_buf = br->allocate(sizeof(size_t), stream, reservation); - size_buf->write_access([&](std::byte* sz_ptr, rmm::cuda_stream_view s) { + invoke_compress_with_device_size_buffer( + [&](size_t* sz_ptr) { mgr.compress( static_cast(d_in), static_cast(d_out), cfg, - reinterpret_cast(sz_ptr) + sz_ptr ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - 
out_bytes, sz_ptr, sizeof(size_t), cudaMemcpyDeviceToHost, s - )); - }); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - } else { - size_t* d_size = nullptr; - RAPIDSMPF_CUDA_TRY(cudaMallocAsync( - reinterpret_cast(&d_size), sizeof(size_t), stream.value() - )); - mgr.compress( - static_cast(d_in), - static_cast(d_out), - cfg, - d_size - ); - RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - out_bytes, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream.value() - )); - RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(stream.value())); - RAPIDSMPF_CUDA_TRY(cudaFreeAsync(d_size, stream.value())); - } + }, + out_bytes, + stream, + br + ); } void decompress( From 721dda91947dc0e6e6f1237882db4c74b353c834 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Nov 2025 02:09:25 -0800 Subject: [PATCH 23/26] Gather packing time --- cpp/benchmarks/bench_comp_comm.cpp | 75 ++++++++++++++++++------------ 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index f2cccb316..4b39b1ea4 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -430,6 +430,7 @@ struct SizeHeader { }; struct Timings { + double pack_s{0.0}; double compress_s{0.0}; double decompress_s{0.0}; // Round-trip totals measured at initiator @@ -454,7 +455,8 @@ RunResult run_once( rmm::cuda_stream_view stream, BufferResource* br, std::shared_ptr const& statistics, - BuffersToSend const& data, + cudf::table const& table, + PackMode pack_mode, NvcompCodec& codec, std::uint64_t run_index ) { @@ -469,10 +471,15 @@ RunResult run_once( Tag tag_ping_c{11, 0}; Tag tag_pong_c{11, 1}; + // Pack data per iteration + auto p0 = Clock::now(); + auto packed = make_packed_items(table, pack_mode, stream, br); + auto p1 = Clock::now(); + // Clone packed items into raw device buffers for repeated ops std::vector> nocomp_payloads; - nocomp_payloads.reserve(data.items.size()); - for (auto const& it : data.items) { + nocomp_payloads.reserve(packed.items.size()); + for (auto const& it : packed.items) { // Copy metadata + data into a contiguous device buffer for pure send path? // For pure send/recv, we only send the device payload; metadata isn't needed for // metrics. We'll send the packed->data buffer. @@ -484,10 +491,10 @@ RunResult run_once( // Pre-allocate compression outputs for each item std::vector> comp_outputs; - std::vector comp_output_sizes(data.items.size()); - comp_outputs.reserve(data.items.size()); - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const in_bytes = data.items[i].packed->data->size; + std::vector comp_output_sizes(packed.items.size()); + comp_outputs.reserve(packed.items.size()); + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const in_bytes = packed.items[i].packed->data->size; std::size_t const max_out = (in_bytes == 0) ? 
1 : codec.get_max_compressed_bytes(in_bytes, stream); auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE); @@ -497,21 +504,21 @@ RunResult run_once( RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = Clock::now(); // Compress all items (single batch) on stream - std::vector comp_streams(data.items.size()); - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const in_bytes = data.items[i].packed->data->size; + std::vector comp_streams(packed.items.size()); + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const in_bytes = packed.items[i].packed->data->size; if (in_bytes == 0) { comp_output_sizes[i] = 0; continue; } // Launch compression on the output buffer's stream and record an event after comp_outputs[i]->write_access( - [&codec, &data, i, in_bytes, &comp_output_sizes, &comp_streams, br]( + [&codec, &packed, i, in_bytes, &comp_output_sizes, &comp_streams, br]( std::byte* out_ptr, rmm::cuda_stream_view out_stream ) { (void)out_ptr; // pointer used below // Lock input for raw pointer access - auto* in_raw = data.items[i].packed->data->exclusive_data_access(); + auto* in_raw = packed.items[i].packed->data->exclusive_data_access(); codec.compress( static_cast(in_raw), in_bytes, @@ -526,13 +533,13 @@ RunResult run_once( ); } // Synchronize streams and unlock inputs - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const in_bytes = data.items[i].packed->data->size; + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const in_bytes = packed.items[i].packed->data->size; if (in_bytes == 0) { continue; } RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(comp_streams[i].value())); - data.items[i].packed->data->unlock(); + packed.items[i].packed->data->unlock(); } auto t1 = Clock::now(); @@ -581,7 +588,7 @@ RunResult run_once( Rank peer = initiator ? dst : src; Tag send_tag_c = initiator ? tag_ping_c : tag_pong_c; Tag recv_tag_c = initiator ? 
tag_pong_c : tag_ping_c; - for (std::size_t i = 0; i < data.items.size(); ++i) { + for (std::size_t i = 0; i < packed.items.size(); ++i) { // Header exchange: send our size, receive peer size std::uint64_t local_sz = static_cast(comp_output_sizes[i]); std::uint64_t remote_sz = exchange_u64_header( @@ -613,9 +620,9 @@ RunResult run_once( // Decompress received buffers (simulate by decompressing our own produced outputs in // symmetric setup) auto c0 = Clock::now(); - std::vector decomp_streams(data.items.size()); - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const out_bytes = data.items[i].packed->data->size; + std::vector decomp_streams(packed.items.size()); + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const out_bytes = packed.items[i].packed->data->size; if (out_bytes == 0) { continue; } @@ -639,8 +646,8 @@ RunResult run_once( ); } // Synchronize each decomp stream and then unlock the corresponding input - for (std::size_t i = 0; i < data.items.size(); ++i) { - auto const out_bytes = data.items[i].packed->data->size; + for (std::size_t i = 0; i < packed.items.size(); ++i) { + auto const out_bytes = packed.items[i].packed->data->size; if (out_bytes == 0) { continue; } @@ -650,13 +657,14 @@ RunResult run_once( auto c1 = Clock::now(); RunResult result{}; + result.times.pack_s = std::chrono::duration(p1 - p0).count(); result.times.compress_s = std::chrono::duration(t1 - t0).count(); result.times.rt_nocomp_s = rt_nc_total.count(); result.times.rt_comp_s = rt_c_total.count(); result.times.decompress_s = std::chrono::duration(c1 - c0).count(); // Use payload (device) bytes as the logical uncompressed size for throughput - result.counts.logical_uncompressed_bytes = data.total_payload_bytes * args.num_ops; + result.counts.logical_uncompressed_bytes = packed.total_payload_bytes * args.num_ops; result.counts.logical_compressed_bytes = std::accumulate( comp_output_sizes.begin(), comp_output_sizes.end(), std::size_t{0} @@ -769,28 +777,30 @@ int main(int argc, char** argv) { auto table_with_md = cudf::io::read_parquet(reader_opts); auto& table = table_with_md.tbl; - // Pack per mode - auto packed = make_packed_items(*table, args.pack_mode, stream, &br); - // Prepare codec auto codec = make_codec(args.algo, args.params); // Runs - std::vector compress_t, decompress_t, rt_nc_t, rt_c_t; + std::vector pack_t, compress_t, decompress_t, rt_nc_t, rt_c_t; + pack_t.reserve(args.num_runs); compress_t.reserve(args.num_runs); decompress_t.reserve(args.num_runs); rt_nc_t.reserve(args.num_runs); rt_c_t.reserve(args.num_runs); - std::size_t logical_bytes = packed.total_uncompressed_bytes * args.num_ops; + std::size_t logical_bytes = 0; std::size_t logical_compressed_bytes_last = 0; for (std::uint64_t i = 0; i < args.num_warmups + args.num_runs; ++i) { if (i == args.num_warmups + args.num_runs - 1) { stats = std::make_shared(/* enable = */ true); } - auto rr = run_once(comm, args, stream, &br, stats, packed, *codec, i); + auto rr = + run_once(comm, args, stream, &br, stats, *table, args.pack_mode, *codec, i); + logical_bytes = rr.counts.logical_uncompressed_bytes; + double pBps = + static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.pack_s; double cBps = static_cast(rr.counts.logical_uncompressed_bytes) / rr.times.compress_s; double dBps = static_cast(rr.counts.logical_uncompressed_bytes) @@ -815,7 +825,7 @@ int main(int argc, char** argv) { : 0.0; std::stringstream ss; - ss << "compress: " << format_nbytes(cBps) + ss << "pack: " << format_nbytes(pBps) << "/s | compress: " 
<< format_nbytes(cBps) << "/s | decompress: " << format_nbytes(dBps) << "/s | rt(nocomp): " << format_nbytes(rt_nc_Bps) << "/s | rt(comp): " << format_nbytes(rt_c_Bps) << "/s" @@ -825,6 +835,9 @@ int main(int argc, char** argv) { log.print(ss.str()); if (i >= args.num_warmups) { + pack_t.push_back( + static_cast(rr.counts.logical_uncompressed_bytes) / pBps + ); compress_t.push_back( static_cast(rr.counts.logical_uncompressed_bytes) / cBps ); @@ -845,13 +858,15 @@ int main(int argc, char** argv) { }; if (!compress_t.empty()) { + double mean_elapsed_p = harmonic_mean(pack_t); double mean_elapsed_c = harmonic_mean(compress_t); double mean_elapsed_d = harmonic_mean(decompress_t); double mean_rt_nc = harmonic_mean(rt_nc_t); double mean_rt_c = harmonic_mean(rt_c_t); std::stringstream ss; - ss << "means: compress: " << format_nbytes(logical_bytes / mean_elapsed_c) << "/s" + ss << "means: pack: " << format_nbytes(logical_bytes / mean_elapsed_p) << "/s" + << " | compress: " << format_nbytes(logical_bytes / mean_elapsed_c) << "/s" << " | decompress: " << format_nbytes(logical_bytes / mean_elapsed_d) << "/s" << " | rt(nocomp): " << format_nbytes((2.0 * static_cast(logical_bytes)) / mean_rt_nc) From 73a17dd5869eb198e8f3573513564b09bc7eae69 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Nov 2025 02:52:25 -0800 Subject: [PATCH 24/26] Make individual column packing non-blocking --- cpp/benchmarks/bench_comp_comm.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index 4b39b1ea4..d40425c6e 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -249,7 +249,6 @@ static std::unique_ptr pack_table_to_packed( auto ret = std::make_unique( std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) ); - ret->data->stream().synchronize(); return ret; } @@ -407,6 +406,8 @@ BuffersToSend make_packed_items( if (mode == PackMode::Table) { auto item = PackedItem{}; item.packed = pack_table_to_packed(table.view(), stream, br); + // Ensure the pack is completed before using the buffer downstream + ensure_ready(*item.packed->data); ret.total_uncompressed_bytes += item.packed->data->size; ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); @@ -420,6 +421,10 @@ BuffersToSend make_packed_items( ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); } + // Synchronize all packs after launching them to avoid per-iteration blocking + for (std::size_t i = 0; i < ret.items.size(); ++i) { + ensure_ready(*ret.items[i].packed->data); + } } return ret; } From bbb555ab1e966f1093601217b5c51b216ab9d606 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Nov 2025 03:21:31 -0800 Subject: [PATCH 25/26] Reorganization and cleanup --- cpp/benchmarks/bench_comp_comm.cpp | 424 +++++++++++++---------------- 1 file changed, 196 insertions(+), 228 deletions(-) diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp index d40425c6e..64e5dcb40 100644 --- a/cpp/benchmarks/bench_comp_comm.cpp +++ b/cpp/benchmarks/bench_comp_comm.cpp @@ -39,7 +39,6 @@ #include #include #include -#include #include #ifdef RAPIDSMPF_HAVE_CUPTI @@ -72,186 +71,6 @@ struct Args { std::string cupti_csv_prefix; }; -std::vector expand_glob(std::string const& pattern) { - std::vector files; - glob_t glob_result{}; - int rc = glob(pattern.c_str(), GLOB_TILDE, nullptr, &glob_result); - if (rc 
== 0) { - for (std::size_t i = 0; i < glob_result.gl_pathc; ++i) { - files.emplace_back(glob_result.gl_pathv[i]); - } - } - globfree(&glob_result); - return files; -} - -std::size_t parse_nbytes(std::string const& s) { - // Simple parser: supports suffixes KiB, MiB, GiB, KB, MB, GB, or no suffix. - auto to_lower = [](char c) { return static_cast(std::tolower(c)); }; - std::string v; - v.reserve(s.size()); - for (char c : s) - v.push_back(to_lower(c)); - - std::size_t mult = 1; - if (v.ends_with("kib")) { - mult = 1ull << 10; - v = v.substr(0, v.size() - 3); - } else if (v.ends_with("mib")) { - mult = 1ull << 20; - v = v.substr(0, v.size() - 3); - } else if (v.ends_with("gib")) { - mult = 1ull << 30; - v = v.substr(0, v.size() - 3); - } else if (v.ends_with("kb")) { - mult = 1000ull; - v = v.substr(0, v.size() - 2); - } else if (v.ends_with("mb")) { - mult = 1000ull * 1000ull; - v = v.substr(0, v.size() - 2); - } else if (v.ends_with("gb")) { - mult = 1000ull * 1000ull * 1000ull; - v = v.substr(0, v.size() - 2); - } - - return static_cast(std::stoll(v)) * mult; -} - -KvParams parse_kv_params(std::string const& kv) { - KvParams p{}; - if (kv.empty()) - return p; - std::size_t start = 0; - while (start < kv.size()) { - auto comma = kv.find(',', start); - auto part = kv.substr( - start, comma == std::string::npos ? std::string::npos : comma - start - ); - auto eq = part.find('='); - if (eq != std::string::npos) { - std::string key = part.substr(0, eq); - std::string val = part.substr(eq + 1); - if (key == "chunk_size") - p.chunk_size = parse_nbytes(val); - else if (key == "delta") - p.cascaded_delta = std::stoi(val); - else if (key == "rle") - p.cascaded_rle = std::stoi(val); - else if (key == "bitpack") - p.cascaded_bitpack = std::stoi(val); - } - if (comma == std::string::npos) - break; - start = comma + 1; - } - return p; -} - -struct PhaseThroughputs { - double compress_Bps{0.0}; - double decompress_Bps{0.0}; - double comp_send_Bps{0.0}; - double recv_decomp_Bps{0.0}; - double send_only_Bps{0.0}; - double recv_only_Bps{0.0}; -}; - -static inline void ensure_ready(Buffer& buf) { - if (!buf.is_latest_write_done()) { - buf.stream().synchronize(); - } -} - -static inline std::unique_ptr alloc_device( - BufferResource* br, rmm::cuda_stream_view stream, std::size_t size -) { - auto res = br->reserve_or_fail(size, MemoryType::DEVICE); - return br->allocate(size, stream, res); -} - -static inline std::unique_ptr alloc_and_copy_device( - BufferResource* br, rmm::cuda_stream_view stream, Buffer const& src -) { - auto out = alloc_device(br, stream, src.size); - buffer_copy(*out, src, src.size); - return out; -} - -// Non-blocking helpers to exchange headers and payloads concurrently. 
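The helpers being moved here implement a symmetric exchange: both ranks first swap payload sizes with exchange_u64_header() (the header send and recv are both posted before either is waited on), and exchange_payload() then posts its receive before its send, so two peers exchanging simultaneously cannot deadlock. A hypothetical per-item round using these same helpers, mirroring their use in run_once():

    // Swap sizes first, then move the payloads; peer/tags/send_buf as in run_once().
    std::uint64_t remote_sz = exchange_u64_header(
        comm, br, stream, peer, send_tag, recv_tag, local_sz
    );
    exchange_payload(
        comm, br, stream, peer, send_tag, recv_tag, std::move(send_buf), remote_sz
    );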
-static inline std::uint64_t exchange_u64_header( - std::shared_ptr const& comm, - BufferResource* br, - rmm::cuda_stream_view stream, - Rank peer, - Tag send_tag, - Tag recv_tag, - std::uint64_t send_value -) { - // Post header send - auto send_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto send_hdr = br->allocate(sizeof(std::uint64_t), stream, send_hdr_res); - send_hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { - std::memcpy(p, &send_value, sizeof(std::uint64_t)); - }); - ensure_ready(*send_hdr); - auto send_hdr_fut = comm->send(std::move(send_hdr), peer, send_tag); - // Post header recv - auto recv_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); - auto recv_hdr = br->allocate(sizeof(std::uint64_t), stream, recv_hdr_res); - ensure_ready(*recv_hdr); - auto recv_hdr_fut = comm->recv(peer, recv_tag, std::move(recv_hdr)); - // Wait recv, read value, then ensure send completion - auto recv_hdr_buf = comm->wait(std::move(recv_hdr_fut)); - std::uint64_t recv_value = 0; - { - auto* p = recv_hdr_buf->exclusive_data_access(); - std::memcpy(&recv_value, p, sizeof(std::uint64_t)); - recv_hdr_buf->unlock(); - } - std::ignore = comm->wait(std::move(send_hdr_fut)); - return recv_value; -} - -static inline void exchange_payload( - std::shared_ptr const& comm, - BufferResource* br, - rmm::cuda_stream_view stream, - Rank peer, - Tag send_tag, - Tag recv_tag, - std::unique_ptr send_buf, // may be null if no data to send - std::size_t recv_size // may be zero if no data to recv -) { - std::unique_ptr data_send_fut; - std::unique_ptr data_recv_fut; - if (recv_size > 0) { - auto recv_buf = alloc_device(br, stream, recv_size); - ensure_ready(*recv_buf); - data_recv_fut = comm->recv(peer, recv_tag, std::move(recv_buf)); - } - if (send_buf && send_buf->size > 0) { - ensure_ready(*send_buf); - data_send_fut = comm->send(std::move(send_buf), peer, send_tag); - } - if (data_recv_fut) { - (void)comm->wait(std::move(data_recv_fut)); - } - if (data_send_fut) { - std::ignore = comm->wait(std::move(data_send_fut)); - } -} - -// Convenience: wrap metadata + gpu_data into rapidsmpf::PackedData -static std::unique_ptr pack_table_to_packed( - cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br -) { - auto packed = cudf::pack(tv, stream, br->device_mr()); - auto ret = std::make_unique( - std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) - ); - return ret; -} - struct ArgumentParser { ArgumentParser(int argc, char* const* argv, bool use_mpi) { int rank = 0; @@ -378,6 +197,68 @@ struct ArgumentParser { return args_; } + static std::size_t parse_nbytes(std::string const& s) { + // Simple parser: supports suffixes KiB, MiB, GiB, KB, MB, GB, or no suffix. 
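+        // Examples, derived from the rules below:
+        //   "1MiB" -> 1048576, "2kb" -> 2000, "512" -> 512 (no suffix).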
+ auto to_lower = [](char c) { return static_cast(std::tolower(c)); }; + std::string v; + v.reserve(s.size()); + for (char c : s) + v.push_back(to_lower(c)); + + std::size_t mult = 1; + if (v.ends_with("kib")) { + mult = 1ull << 10; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("mib")) { + mult = 1ull << 20; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("gib")) { + mult = 1ull << 30; + v = v.substr(0, v.size() - 3); + } else if (v.ends_with("kb")) { + mult = 1000ull; + v = v.substr(0, v.size() - 2); + } else if (v.ends_with("mb")) { + mult = 1000ull * 1000ull; + v = v.substr(0, v.size() - 2); + } else if (v.ends_with("gb")) { + mult = 1000ull * 1000ull * 1000ull; + v = v.substr(0, v.size() - 2); + } + + return static_cast(std::stoll(v)) * mult; + } + + static KvParams parse_kv_params(std::string const& kv) { + KvParams p{}; + if (kv.empty()) + return p; + std::size_t start = 0; + while (start < kv.size()) { + auto comma = kv.find(',', start); + auto part = kv.substr( + start, comma == std::string::npos ? std::string::npos : comma - start + ); + auto eq = part.find('='); + if (eq != std::string::npos) { + std::string key = part.substr(0, eq); + std::string val = part.substr(eq + 1); + if (key == "chunk_size") + p.chunk_size = parse_nbytes(val); + else if (key == "delta") + p.cascaded_delta = std::stoi(val); + else if (key == "rle") + p.cascaded_rle = std::stoi(val); + else if (key == "bitpack") + p.cascaded_bitpack = std::stoi(val); + } + if (comma == std::string::npos) + break; + start = comma + 1; + } + return p; + } + private: Args args_{}; }; @@ -396,6 +277,54 @@ struct BuffersToSend { std::size_t total_payload_bytes{0}; }; +struct Timings { + double pack_s{0.0}; + double compress_s{0.0}; + double decompress_s{0.0}; + // Round-trip totals measured at initiator + double rt_nocomp_s{0.0}; + double rt_comp_s{0.0}; +}; + +struct Counters { + std::size_t logical_uncompressed_bytes{0}; + std::size_t logical_compressed_bytes{0}; +}; + +struct RunResult { + Timings times; + Counters counts; +}; + +std::vector expand_glob(std::string const& pattern) { + std::vector files; + glob_t glob_result{}; + int rc = glob(pattern.c_str(), GLOB_TILDE, nullptr, &glob_result); + if (rc == 0) { + for (std::size_t i = 0; i < glob_result.gl_pathc; ++i) { + files.emplace_back(glob_result.gl_pathv[i]); + } + } + globfree(&glob_result); + return files; +} + +static inline void ensure_ready(Buffer& buf) { + if (!buf.is_latest_write_done()) { + buf.stream().synchronize(); + } +} + +static std::unique_ptr pack_table_to_packed( + cudf::table_view tv, rmm::cuda_stream_view stream, BufferResource* br +) { + auto packed = cudf::pack(tv, stream, br->device_mr()); + auto ret = std::make_unique( + std::move(packed.metadata), br->move(std::move(packed.gpu_data), stream) + ); + return ret; +} + BuffersToSend make_packed_items( cudf::table const& table, PackMode mode, @@ -406,7 +335,6 @@ BuffersToSend make_packed_items( if (mode == PackMode::Table) { auto item = PackedItem{}; item.packed = pack_table_to_packed(table.view(), stream, br); - // Ensure the pack is completed before using the buffer downstream ensure_ready(*item.packed->data); ret.total_uncompressed_bytes += item.packed->data->size; ret.total_payload_bytes += item.packed->data->size; @@ -421,7 +349,6 @@ BuffersToSend make_packed_items( ret.total_payload_bytes += item.packed->data->size; ret.items.emplace_back(std::move(item)); } - // Synchronize all packs after launching them to avoid per-iteration blocking for (std::size_t i = 0; i < 
ret.items.size(); ++i) { ensure_ready(*ret.items[i].packed->data); } @@ -429,43 +356,94 @@ BuffersToSend make_packed_items( return ret; } -// Send/recv helpers: send a header (compressed size) as host buffer. -struct SizeHeader { - std::uint64_t bytes; -}; +static inline std::unique_ptr alloc_device( + BufferResource* br, rmm::cuda_stream_view stream, std::size_t size +) { + auto res = br->reserve_or_fail(size, MemoryType::DEVICE); + return br->allocate(size, stream, res); +} -struct Timings { - double pack_s{0.0}; - double compress_s{0.0}; - double decompress_s{0.0}; - // Round-trip totals measured at initiator - double rt_nocomp_s{0.0}; - double rt_comp_s{0.0}; -}; +static inline std::unique_ptr alloc_and_copy_device( + BufferResource* br, rmm::cuda_stream_view stream, Buffer const& src +) { + auto out = alloc_device(br, stream, src.size); + buffer_copy(*out, src, src.size); + return out; +} -// Returns timings and bytes counters -struct Counters { - std::size_t logical_uncompressed_bytes{0}; - std::size_t logical_compressed_bytes{0}; -}; +static inline std::uint64_t exchange_u64_header( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank peer, + Tag send_tag, + Tag recv_tag, + std::uint64_t send_value +) { + // Post header send + auto send_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto send_hdr = br->allocate(sizeof(std::uint64_t), stream, send_hdr_res); + send_hdr->write_access([&](std::byte* p, rmm::cuda_stream_view) { + std::memcpy(p, &send_value, sizeof(std::uint64_t)); + }); + ensure_ready(*send_hdr); + auto send_hdr_fut = comm->send(std::move(send_hdr), peer, send_tag); + // Post header recv + auto recv_hdr_res = br->reserve_or_fail(sizeof(std::uint64_t), MemoryType::HOST); + auto recv_hdr = br->allocate(sizeof(std::uint64_t), stream, recv_hdr_res); + ensure_ready(*recv_hdr); + auto recv_hdr_fut = comm->recv(peer, recv_tag, std::move(recv_hdr)); + // Wait recv, read value, then ensure send completion + auto recv_hdr_buf = comm->wait(std::move(recv_hdr_fut)); + std::uint64_t recv_value = 0; + { + auto* p = recv_hdr_buf->exclusive_data_access(); + std::memcpy(&recv_value, p, sizeof(std::uint64_t)); + recv_hdr_buf->unlock(); + } + std::ignore = comm->wait(std::move(send_hdr_fut)); + return recv_value; +} -struct RunResult { - Timings times; - Counters counts; -}; +static inline void exchange_payload( + std::shared_ptr const& comm, + BufferResource* br, + rmm::cuda_stream_view stream, + Rank peer, + Tag send_tag, + Tag recv_tag, + std::unique_ptr send_buf, + std::size_t recv_size +) { + std::unique_ptr data_send_fut; + std::unique_ptr data_recv_fut; + if (recv_size > 0) { + auto recv_buf = alloc_device(br, stream, recv_size); + ensure_ready(*recv_buf); + data_recv_fut = comm->recv(peer, recv_tag, std::move(recv_buf)); + } + if (send_buf && send_buf->size > 0) { + ensure_ready(*send_buf); + data_send_fut = comm->send(std::move(send_buf), peer, send_tag); + } + if (data_recv_fut) { + std::ignore = comm->wait(std::move(data_recv_fut)); + } + if (data_send_fut) { + std::ignore = comm->wait(std::move(data_send_fut)); + } +} RunResult run_once( std::shared_ptr const& comm, Args const& args, rmm::cuda_stream_view stream, BufferResource* br, - std::shared_ptr const& statistics, cudf::table const& table, PackMode pack_mode, NvcompCodec& codec, std::uint64_t run_index ) { - (void)statistics; auto const nranks = comm->nranks(); auto const rank = comm->rank(); auto const dst = static_cast((rank + 1) % nranks); @@ -521,7 
+499,6 @@ RunResult run_once( [&codec, &packed, i, in_bytes, &comp_output_sizes, &comp_streams, br]( std::byte* out_ptr, rmm::cuda_stream_view out_stream ) { - (void)out_ptr; // pointer used below // Lock input for raw pointer access auto* in_raw = packed.items[i].packed->data->exclusive_data_access(); codec.compress( @@ -532,7 +509,6 @@ RunResult run_once( out_stream, br ); - // Defer synchronization and unlock; record stream for later sync comp_streams[i] = out_stream; } ); @@ -548,8 +524,7 @@ RunResult run_once( } auto t1 = Clock::now(); - // Phase A (RTT no compression): ping-pong per op (sequential per item to avoid - // deadlocks) + // Phase A (RTT no compression): ping-pong per op (sequential per item) Duration rt_nc_total{0}; for (std::uint64_t op = 0; op < args.num_ops; ++op) { bool initiator = @@ -559,7 +534,7 @@ RunResult run_once( Tag send_tag_nc = initiator ? tag_ping_nc : tag_pong_nc; Tag recv_tag_nc = initiator ? tag_pong_nc : tag_ping_nc; for (std::size_t i = 0; i < nocomp_payloads.size(); ++i) { - // Exchange payload sizes to avoid assuming symmetric sizes across ranks + // Exchange payload sizes std::uint64_t local_sz = static_cast(nocomp_payloads[i]->size); std::uint64_t remote_sz = exchange_u64_header( comm, br, stream, peer, send_tag_nc, recv_tag_nc, local_sz @@ -622,8 +597,7 @@ RunResult run_once( rt_c_total += (rt_end - rt_start); } - // Decompress received buffers (simulate by decompressing our own produced outputs in - // symmetric setup) + // Decompress received buffers (simulate by decompressing our own produced outputs) auto c0 = Clock::now(); std::vector decomp_streams(packed.items.size()); for (std::size_t i = 0; i < packed.items.size(); ++i) { @@ -645,7 +619,6 @@ RunResult run_once( out_bytes, out_stream ); - // Defer unlock until after per-stream synchronization decomp_streams[i] = out_stream; } ); @@ -668,7 +641,7 @@ RunResult run_once( result.times.rt_comp_s = rt_c_total.count(); result.times.decompress_s = std::chrono::duration(c1 - c0).count(); - // Use payload (device) bytes as the logical uncompressed size for throughput + // Use payload bytes as the logical uncompressed size for throughput result.counts.logical_uncompressed_bytes = packed.total_payload_bytes * args.num_ops; result.counts.logical_compressed_bytes = std::accumulate( @@ -682,10 +655,13 @@ RunResult run_once( int main(int argc, char** argv) { // Check if we should use bootstrap mode with rrun + // This is determined by checking for RAPIDSMPF_RANK environment variable bool use_bootstrap = std::getenv("RAPIDSMPF_RANK") != nullptr; int provided = 0; if (!use_bootstrap) { + // Explicitly initialize MPI with thread support, as this is needed for both mpi + // and ucxx communicators. 
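+        // The RAPIDSMPF_EXPECTS check below fails fast if the MPI
+        // implementation cannot actually provide MPI_THREAD_MULTIPLE.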
@@ -682,10 +655,13 @@ int main(int argc, char** argv) {
     // Check if we should use bootstrap mode with rrun
+    // This is determined by checking for the RAPIDSMPF_RANK environment variable
     bool use_bootstrap = std::getenv("RAPIDSMPF_RANK") != nullptr;
 
     int provided = 0;
     if (!use_bootstrap) {
+        // Explicitly initialize MPI with thread support, as this is needed for both
+        // the mpi and ucxx communicators.
         RAPIDSMPF_MPI(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided));
         RAPIDSMPF_EXPECTS(
             provided == MPI_THREAD_MULTIPLE,
@@ -711,10 +687,12 @@ int main(int argc, char** argv) {
         comm = std::make_shared<MPI>(MPI_COMM_WORLD, options);
     } else if (args.comm_type == "ucxx") {
         if (use_bootstrap) {
+            // Launched with rrun - use the bootstrap backend
             comm = rapidsmpf::bootstrap::create_ucxx_comm(
                 rapidsmpf::bootstrap::Backend::AUTO, options
             );
         } else {
+            // Launched with mpirun - use the MPI bootstrap
             comm = rapidsmpf::ucxx::init_using_mpi(MPI_COMM_WORLD, options);
         }
     } else {
@@ -725,12 +703,11 @@ int main(int argc, char** argv) {
     auto& log = comm->logger();
     rmm::cuda_stream_view stream = cudf::get_default_stream();
 
-    // RMM setup
     auto const mr_stack = set_current_rmm_stack(args.rmm_mr);
     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
     BufferResource br{mr};
 
-    // Hardware info
+    // Print benchmark/hardware info.
     {
         std::stringstream ss;
         auto const cur_dev = rmm::get_current_cuda_device().value();
@@ -749,9 +726,8 @@ int main(int argc, char** argv) {
         log.print(ss.str());
     }
 
-    // Stats and CUPTI
-    auto stats = std::make_shared<Statistics>(/* enable = */ false);
 #ifdef RAPIDSMPF_HAVE_CUPTI
+    // Create CUPTI monitor if enabled
     std::unique_ptr<CuptiMonitor> cupti_monitor;
     if (args.enable_cupti_monitoring) {
         cupti_monitor = std::make_unique<CuptiMonitor>();
@@ -776,16 +752,13 @@ int main(int argc, char** argv) {
     );
     log.print("Rank " + std::to_string(comm->rank()) + " reading: " + my_file);
 
-    // Read Parquet into cudf::table
     cudf::io::parquet_reader_options reader_opts =
         cudf::io::parquet_reader_options::builder(cudf::io::source_info{my_file});
     auto table_with_md = cudf::io::read_parquet(reader_opts);
     auto& table = table_with_md.tbl;
 
-    // Prepare codec
     auto codec = make_codec(args.algo, args.params);
 
-    // Runs
     std::vector<double> pack_t, compress_t, decompress_t, rt_nc_t, rt_c_t;
     pack_t.reserve(args.num_runs);
     compress_t.reserve(args.num_runs);
@@ -797,11 +770,7 @@ int main(int argc, char** argv) {
     std::size_t logical_compressed_bytes_last = 0;
 
     for (std::uint64_t i = 0; i < args.num_warmups + args.num_runs; ++i) {
-        if (i == args.num_warmups + args.num_runs - 1) {
-            stats = std::make_shared<Statistics>(/* enable = */ true);
-        }
-        auto rr =
-            run_once(comm, args, stream, &br, stats, *table, args.pack_mode, *codec, i);
+        auto rr = run_once(comm, args, stream, &br, *table, args.pack_mode, *codec, i);
         logical_bytes = rr.counts.logical_uncompressed_bytes;
 
         double pBps =
@@ -854,7 +823,6 @@ int main(int argc, char** argv) {
         }
     }
 
-    // Means
     auto harmonic_mean = [](std::vector<double> const& v) {
         double denom_sum = 0.0;
         for (auto x : v)

From c46305ff5e193b7b4c60faf16b4eb03aac39d987 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 12 Nov 2025 12:16:37 -0800
Subject: [PATCH 26/26] Support (de-)compressing and transferring data only

---
 cpp/benchmarks/bench_comp_comm.cpp | 106 +++++++++++++++++++++++++----
 1 file changed, 93 insertions(+), 13 deletions(-)

diff --git a/cpp/benchmarks/bench_comp_comm.cpp b/cpp/benchmarks/bench_comp_comm.cpp
index 64e5dcb40..7fd1af243 100644
--- a/cpp/benchmarks/bench_comp_comm.cpp
+++ b/cpp/benchmarks/bench_comp_comm.cpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -69,6 +70,7 @@ struct Args {
     std::uint64_t num_ops{1};
     bool enable_cupti_monitoring{false};
     std::string cupti_csv_prefix;
+    bool data_only{false};
 };
 
 struct ArgumentParser {
@@ -81,8 +83,8 @@ struct ArgumentParser {
         try {
             int opt;
             // C: comm, r: runs, w: warmups, m: rmm, F: files, P: pack mode, A: algo, K:
-            // kv, p: ops, M: cupti, h: help
-            while ((opt = getopt(argc, argv, "C:r:w:m:F:P:A:K:p:M:h")) != -1) {
+            // kv, p: ops, M: cupti, D: data-only, h: help
+            while ((opt = getopt(argc, argv, "C:r:w:m:F:P:A:K:p:M:Dh")) != -1) {
                 switch (opt) {
                 case 'C':
                     args_.comm_type = std::string{optarg};
@@ -140,6 +142,9 @@ struct ArgumentParser {
                     args_.enable_cupti_monitoring = true;
                     args_.cupti_csv_prefix = std::string{optarg};
                     break;
+                case 'D':
+                    args_.data_only = true;
+                    break;
                 case 'h':
                 default:
                 {
@@ -160,6 +165,8 @@ struct ArgumentParser {
                           "chunk_size=1MiB,delta=1,rle=1,bitpack=1\n"
                        << "  -p  Number of concurrent ops (default: 1)\n"
                        << "  -M  CUPTI CSV path prefix (enable CUPTI)\n"
+                       << "  -D  Data-only mode (compress/transfer data "
+                          "buffers only)\n"
                        << "  -h  Show this help\n";
                     if (rank == 0)
                         std::cerr << ss.str();
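For context, a usage sketch of the new flag (illustrative only: the launcher, rank count, file pattern, and the -A value are assumptions, since algorithm-name parsing is outside this excerpt):

    mpirun -np 2 bench_comp_comm -C mpi -F '/data/part-*.parquet' -A lz4 -r 3 -w 1 -D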
@@ -266,6 +273,8 @@ struct ArgumentParser {
 struct PackedItem {
     // Ownership: we store size and buffer pointer for the packed payload
     std::unique_ptr<PackedData> packed;  // original packed cudf table/column
+    // Data-only mode: directly owned GPU buffer containing column/table data
+    std::unique_ptr<Buffer> raw_data;
 };
 
 struct BuffersToSend {
@@ -277,6 +286,14 @@ struct BuffersToSend {
     std::size_t total_payload_bytes{0};
 };
 
+static inline Buffer& item_data_buffer(PackedItem& it) {
+    return it.packed ? *it.packed->data : *it.raw_data;
+}
+
+static inline Buffer const& item_data_buffer(PackedItem const& it) {
+    return it.packed ? *it.packed->data : *it.raw_data;
+}
+
 struct Timings {
     double pack_s{0.0};
     double compress_s{0.0};
@@ -356,6 +373,67 @@ BuffersToSend make_packed_items(
     return ret;
 }
 
+// Collect leaf data buffers recursively (device memory ranges) from a column.
+static void collect_leaf_data_buffers(
+    cudf::column_view const& col,
+    std::vector<std::pair<std::byte const*, std::size_t>>& buffers
+) {
+    for (auto it = col.child_begin(); it != col.child_end(); ++it) {
+        collect_leaf_data_buffers(*it, buffers);
+    }
+    if (col.num_children() == 0 && col.size() > 0) {
+        auto const elem_size = static_cast<std::size_t>(cudf::size_of(col.type()));
+        if (elem_size == 0) {
+            return;
+        }
+        auto const* base = col.head<std::byte>();
+        if (base == nullptr) {
+            return;
+        }
+        auto const byte_offset = static_cast<std::size_t>(col.offset()) * elem_size;
+        auto const num_bytes = static_cast<std::size_t>(col.size()) * elem_size;
+        buffers.emplace_back(static_cast<std::byte const*>(base + byte_offset), num_bytes);
+    }
+}
+
+static std::vector<std::pair<std::byte const*, std::size_t>> collect_table_data_buffers(
+    cudf::table_view const& tv
+) {
+    std::vector<std::pair<std::byte const*, std::size_t>> out;
+    for (auto const& col : tv) {
+        collect_leaf_data_buffers(col, out);
+    }
+    return out;
+}
+
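A minimal usage sketch for the collectors above (illustrative helper, not part of the patch). Only leaf data ranges are gathered, so null masks and zero-width element types contribute nothing; this can serve as a sanity check against the totals accumulated in the data-only path below:

    // Illustrative only: total leaf data bytes for a table view.
    static std::size_t total_leaf_bytes(cudf::table_view const& tv) {
        std::size_t total = 0;
        for (auto const& leaf : collect_table_data_buffers(tv)) {
            total += leaf.second;  // byte count of one leaf data range
        }
        return total;
    }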
+// Data-only path: build items consisting solely of copies of each leaf device buffer.
+BuffersToSend make_data_only_items(
+    cudf::table const& table, rmm::cuda_stream_view stream, BufferResource* br
+) {
+    BuffersToSend ret{};
+    auto leaves = collect_table_data_buffers(table.view());
+    ret.items.reserve(leaves.size());
+    for (auto const& [src_ptr, nbytes] : leaves) {
+        if (nbytes == 0) {
+            continue;
+        }
+        auto reservation = br->reserve_or_fail(nbytes, MemoryType::DEVICE);
+        auto buf = br->allocate(nbytes, stream, reservation);
+        buf->write_access([&](std::byte* dst, rmm::cuda_stream_view s) {
+            RAPIDSMPF_CUDA_TRY_ALLOC(
+                cudaMemcpyAsync(dst, src_ptr, nbytes, cudaMemcpyDefault, s)
+            );
+        });
+        ensure_ready(*buf);
+        PackedItem item{};
+        item.raw_data = std::move(buf);
+        ret.total_uncompressed_bytes += nbytes;
+        ret.total_payload_bytes += nbytes;
+        ret.items.emplace_back(std::move(item));
+    }
+    return ret;
+}
+
 static inline std::unique_ptr<Buffer> alloc_device(
     BufferResource* br, rmm::cuda_stream_view stream, std::size_t size
 ) {
     auto res = br->reserve_or_fail(size, MemoryType::DEVICE);
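The item_data_buffer() accessor introduced above is what lets the rest of run_once stay agnostic to which builder produced an item. A sketch (illustrative helper, not part of the patch) of the pattern:

    // Illustrative only: item_data_buffer() resolves to packed->data for
    // packed items and to raw_data for data-only items.
    static std::size_t total_item_bytes(BuffersToSend const& b) {
        std::size_t total = 0;
        for (auto const& it : b.items) {
            total += item_data_buffer(it).size;
        }
        return total;
    }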
@@ -456,7 +534,8 @@ RunResult run_once(
 
     // Pack data per iteration
     auto p0 = Clock::now();
-    auto packed = make_packed_items(table, pack_mode, stream, br);
+    auto packed = args.data_only ? make_data_only_items(table, stream, br)
+                                 : make_packed_items(table, pack_mode, stream, br);
     auto p1 = Clock::now();
 
     // Clone packed items into raw device buffers for repeated ops
@@ -466,9 +545,10 @@ RunResult run_once(
         // Copy metadata + data into a contiguous device buffer for pure send path?
         // For pure send/recv, we only send the device payload; metadata isn't needed for
         // metrics. We'll send the packed->data buffer.
-        auto reservation = br->reserve_or_fail(it.packed->data->size, MemoryType::DEVICE);
-        auto buf = br->allocate(it.packed->data->size, stream, reservation);
-        buffer_copy(*buf, *it.packed->data, it.packed->data->size);
+        auto const& src_buf = item_data_buffer(it);
+        auto reservation = br->reserve_or_fail(src_buf.size, MemoryType::DEVICE);
+        auto buf = br->allocate(src_buf.size, stream, reservation);
+        buffer_copy(*buf, src_buf, src_buf.size);
         nocomp_payloads.emplace_back(std::move(buf));
     }
 
@@ -477,7 +557,7 @@ RunResult run_once(
     std::vector<std::size_t> comp_output_sizes(packed.items.size());
     comp_outputs.reserve(packed.items.size());
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const in_bytes = packed.items[i].packed->data->size;
+        auto const in_bytes = item_data_buffer(packed.items[i]).size;
         std::size_t const max_out =
             (in_bytes == 0) ? 1 : codec.get_max_compressed_bytes(in_bytes, stream);
         auto reservation = br->reserve_or_fail(max_out, MemoryType::DEVICE);
@@ -489,7 +569,7 @@ RunResult run_once(
     // Compress all items (single batch) on stream
     std::vector<rmm::cuda_stream_view> comp_streams(packed.items.size());
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const in_bytes = packed.items[i].packed->data->size;
+        auto const in_bytes = item_data_buffer(packed.items[i]).size;
         if (in_bytes == 0) {
             comp_output_sizes[i] = 0;
             continue;
         }
@@ -500,7 +580,7 @@ RunResult run_once(
                     std::byte* out_ptr, rmm::cuda_stream_view out_stream
                 ) {
                     // Lock input for raw pointer access
-                    auto* in_raw = packed.items[i].packed->data->exclusive_data_access();
+                    auto* in_raw = item_data_buffer(packed.items[i]).exclusive_data_access();
                     codec.compress(
                         static_cast<std::byte const*>(in_raw),
                         in_bytes,
@@ -515,12 +595,12 @@ RunResult run_once(
     }
     // Synchronize streams and unlock inputs
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const in_bytes = packed.items[i].packed->data->size;
+        auto const in_bytes = item_data_buffer(packed.items[i]).size;
         if (in_bytes == 0) {
             continue;
         }
         RAPIDSMPF_CUDA_TRY(cudaStreamSynchronize(comp_streams[i].value()));
-        packed.items[i].packed->data->unlock();
+        item_data_buffer(packed.items[i]).unlock();
     }
@@ -601,7 +681,7 @@ RunResult run_once(
     auto c0 = Clock::now();
     std::vector<rmm::cuda_stream_view> decomp_streams(packed.items.size());
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const out_bytes = packed.items[i].packed->data->size;
+        auto const out_bytes = item_data_buffer(packed.items[i]).size;
         if (out_bytes == 0) {
             continue;
         }
@@ -625,7 +705,7 @@ RunResult run_once(
     }
     // Synchronize each decomp stream and then unlock the corresponding input
     for (std::size_t i = 0; i < packed.items.size(); ++i) {
-        auto const out_bytes = packed.items[i].packed->data->size;
+        auto const out_bytes = item_data_buffer(packed.items[i]).size;
         if (out_bytes == 0) {
             continue;
         }