From 3b842c10ce17c2b78b3b6218278ea70e2cde49a5 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Mon, 12 Jan 2026 13:53:29 -0800 Subject: [PATCH 01/10] reenable chunked pack benchmarks Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index dde7f2f48..40db6d52b 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ @@ -85,8 +85,7 @@ static void BM_Pack_device(benchmark::State& state) { * @brief Benchmark for cudf::pack with pinned memory */ static void BM_Pack_pinned(benchmark::State& state) { - state.SkipWithMessage("Skipping until cudf#20886 is fixed"); - /* if (!rapidsmpf::is_pinned_memory_resources_supported()) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; } @@ -102,7 +101,7 @@ static void BM_Pack_pinned(benchmark::State& state) { }; rapidsmpf::PinnedMemoryResource pinned_mr; - run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); */ + run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); } /** @@ -206,8 +205,7 @@ static void BM_ChunkedPack_device(benchmark::State& state) { * @brief Benchmark for cudf::chunked_pack pinned memory */ static void BM_ChunkedPack_pinned(benchmark::State& state) { - state.SkipWithMessage("Skipping until cudf#20886 is fixed"); - /* if (!rapidsmpf::is_pinned_memory_resources_supported()) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; } @@ -228,7 +226,7 @@ static void BM_ChunkedPack_pinned(benchmark::State& state) { run_chunked_pack( 
state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, stream - ); */ + ); } // Custom argument generator for the benchmark @@ -286,8 +284,7 @@ static void BM_ChunkedPack_fixed_table_device(benchmark::State& state) { * and keeping table size fixed at 1GB */ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { - state.SkipWithMessage("Skipping until cudf#20886 is fixed"); - /* if (!rapidsmpf::is_pinned_memory_resources_supported()) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; } @@ -305,7 +302,7 @@ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { run_chunked_pack( state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, stream - ); */ + ); } // Custom argument generator for the benchmark From aed3a27bd54990a03ad683792507352ecee9a206 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Mon, 12 Jan 2026 16:38:23 -0800 Subject: [PATCH 02/10] extending bench Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 200 +++++++++++++++++++++++++++++----- 1 file changed, 173 insertions(+), 27 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 40db6d52b..4c136fc1c 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include "utils/random_data.hpp" @@ -33,11 +35,12 @@ constexpr std::size_t MB = 1024 * 1024; * @param pack_mr The memory resource for the packed data * @param stream The CUDA stream to use */ -void run_pack( +void run_pack_and_copy( benchmark::State& state, std::size_t table_size_mb, rmm::device_async_resource_ref table_mr, rmm::device_async_resource_ref pack_mr, + rmm::host_async_resource_ref dest_mr, rmm::cuda_stream_view stream ) { auto const table_size_bytes = table_size_mb * MB; @@ -49,11 +52,21 @@ void run_pack( // Warm up auto warm_up = cudf::pack(table.view(), 
stream, pack_mr); + + rapidsmpf::HostBuffer dest(warm_up.gpu_data->size(), stream, dest_mr); stream.synchronize(); for (auto _ : state) { auto packed = cudf::pack(table.view(), stream, pack_mr); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + dest.data(), + packed.gpu_data->data(), + packed.gpu_data->size(), + cudaMemcpyDefault, + stream + )); benchmark::DoNotOptimize(packed); + benchmark::DoNotOptimize(dest); stream.synchronize(); } @@ -68,7 +81,7 @@ void run_pack( /** * @brief Benchmark for cudf::pack with device memory */ -static void BM_Pack_device(benchmark::State& state) { +static void BM_Pack_device_copy_to_host(benchmark::State& state) { auto const table_size_mb = static_cast(state.range(0)); rmm::cuda_stream_view stream = rmm::cuda_stream_default; @@ -78,13 +91,31 @@ static void BM_Pack_device(benchmark::State& state) { rmm::mr::pool_memory_resource pool_mr{ cuda_mr, rmm::percent_of_free_device_memory(40) }; - run_pack(state, table_size_mb, pool_mr, pool_mr, stream); + rapidsmpf::HostMemoryResource host_mr; + run_pack_and_copy(state, table_size_mb, pool_mr, pool_mr, host_mr, stream); +} + +/** + * @brief Benchmark for cudf::pack with device memory + */ +static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + run_pack_and_copy(state, table_size_mb, pool_mr, pool_mr, pinned_mr, stream); } /** * @brief Benchmark for cudf::pack with pinned memory */ -static void BM_Pack_pinned(benchmark::State& state) { +static void BM_Pack_pinned_copy_to_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; @@ -100,8 
+131,32 @@ static void BM_Pack_pinned(benchmark::State& state) { cuda_mr, rmm::percent_of_free_device_memory(40) }; rapidsmpf::PinnedMemoryResource pinned_mr; + rapidsmpf::HostMemoryResource dest_mr; - run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); + run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, dest_mr, stream); +} + +/** + * @brief Benchmark for cudf::pack with pinned memory + */ +static void BM_Pack_pinned_copy_to_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, pinned_mr, stream); } /** @@ -111,14 +166,19 @@ static void BM_Pack_pinned(benchmark::State& state) { * @param table_size The size of the table in bytes * @param table_mr The memory resource for the table * @param pack_mr The memory resource for the packed data + * @param dest_mr The memory resource for the destination data * @param stream The CUDA stream to use + * + * @tparam DestinationBufferType The type of the destination buffer */ +template void run_chunked_pack( benchmark::State& state, std::size_t bounce_buffer_size, std::size_t table_size, rmm::device_async_resource_ref table_mr, rmm::device_async_resource_ref pack_mr, + auto& dest_mr, rmm::cuda_stream_view stream ) { // Calculate number of rows for a single-column table of the desired size @@ -134,7 +194,7 @@ void run_chunked_pack( // Allocate bounce buffer and destination buffer using the pack_mr rmm::device_buffer bounce_buffer(bounce_buffer_size, stream, pack_mr); - rmm::device_buffer 
destination(total_size, stream, pack_mr); + DestinationBufferType destination(total_size, stream, dest_mr); auto run_packer = [&] { cudf::chunked_pack packer(table.view(), bounce_buffer_size, stream, pack_mr); @@ -147,7 +207,7 @@ void run_chunked_pack( ) ); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - static_cast(destination.data()) + offset, + reinterpret_cast(destination.data()) + offset, bounce_buffer.data(), bytes_copied, cudaMemcpyDefault, @@ -182,7 +242,7 @@ void run_chunked_pack( /** * @brief Benchmark for cudf::chunked_pack with device memory */ -static void BM_ChunkedPack_device(benchmark::State& state) { +static void BM_ChunkedPack_device_copy_to_pinned_host(benchmark::State& state) { auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -195,21 +255,70 @@ static void BM_ChunkedPack_device(benchmark::State& state) { rmm::mr::pool_memory_resource pool_mr{ cuda_mr, rmm::percent_of_free_device_memory(40) }; + rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, stream + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pinned_mr, stream ); } /** - * @brief Benchmark for cudf::chunked_pack pinned memory + * @brief Benchmark for cudf::chunked_pack with device memory, copying to host memory. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
*/ -static void BM_ChunkedPack_pinned(benchmark::State& state) { - if (!rapidsmpf::is_pinned_memory_resources_supported()) { - state.SkipWithMessage("Pinned memory resources are not supported"); - return; - } +static void BM_ChunkedPack_device_copy_to_host(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::HostMemoryResource host_mr; + + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, host_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack with pinned memory, copying to pinned host + * memory. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ +static void BM_ChunkedPack_pinned_copy_to_pinned_host(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack with pinned memory, copying to host memory. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
+ */ +static void BM_ChunkedPack_pinned_copy_to_host(benchmark::State& state) { auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -223,13 +332,20 @@ static void BM_ChunkedPack_pinned(benchmark::State& state) { cuda_mr, rmm::percent_of_free_device_memory(40) }; rapidsmpf::PinnedMemoryResource pinned_mr; + rapidsmpf::HostMemoryResource host_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, host_mr, stream ); } -// Custom argument generator for the benchmark +/** + * @brief Custom argument generator for pack benchmarks. + * + * Configures benchmarks to run with various table sizes ranging from 1MB to 4GB. + * + * @param b The benchmark to configure with arguments. + */ void PackArguments(benchmark::internal::Benchmark* b) { // Test different table sizes in MB (minimum 1MB as requested) for (auto size_mb : {1, 10, 100, 500, 1000, 2000, 4000}) { @@ -238,21 +354,45 @@ void PackArguments(benchmark::internal::Benchmark* b) { } // Register the benchmarks -BENCHMARK(BM_Pack_device) + +// Pack benchmarks +BENCHMARK(BM_Pack_device_copy_to_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_Pack_device_copy_to_pinned_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_Pack_pinned_copy_to_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_Pack_pinned_copy_to_pinned_host) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_Pack_pinned) + +// Chunked pack benchmarks +BENCHMARK(BM_ChunkedPack_device_copy_to_pinned_host) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_ChunkedPack_device) +BENCHMARK(BM_ChunkedPack_device_copy_to_host) ->Apply(PackArguments) ->UseRealTime() 
->Unit(benchmark::kMillisecond); -BENCHMARK(BM_ChunkedPack_pinned) +BENCHMARK(BM_ChunkedPack_pinned_copy_to_pinned_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned_copy_to_host) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); @@ -274,8 +414,8 @@ static void BM_ChunkedPack_fixed_table_device(benchmark::State& state) { cuda_mr, rmm::percent_of_free_device_memory(40) }; - run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, stream + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream ); } @@ -300,14 +440,20 @@ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { }; rapidsmpf::PinnedMemoryResource pinned_mr; - run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, stream + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream ); } -// Custom argument generator for the benchmark +/** + * @brief Custom argument generator for chunked pack benchmarks with fixed table size. + * + * Configures benchmarks to run with various bounce buffer sizes ranging from 1MB to 1GB. + * + * @param b The benchmark to configure with arguments. 
+ */ void ChunkedPackArguments(benchmark::internal::Benchmark* b) { - // Test different table sizes in MB (minimum 1MB as requested) + // Test different bounce buffer sizes in MB for (auto bounce_buf_sz_mb : {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}) { b->Args({bounce_buf_sz_mb}); } From dea025b9c3b412cee386f9d889cc0bb45eaa4bdd Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 13 Jan 2026 14:20:48 -0800 Subject: [PATCH 03/10] more cases Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 153 +++++++++++++++++++++++++++++++--- 1 file changed, 143 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 4c136fc1c..37f78b581 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -28,7 +28,80 @@ constexpr std::size_t MB = 1024 * 1024; /** - * @brief Runs the cudf::pack benchmark + * @brief Runs the cudf::pack using a device-accessible memory resource + * @param state The benchmark state + * @param table_size_mb The size of the table in MB + * @param table_mr The memory resource for the table + * @param pack_mr The memory resource for the packed data + * @param stream The CUDA stream to use + */ +void run_pack( + benchmark::State& state, + std::size_t table_size_mb, + rmm::device_async_resource_ref table_mr, + rmm::device_async_resource_ref pack_mr, + rmm::cuda_stream_view stream +) { + auto const table_size_bytes = table_size_mb * MB; + + // Calculate number of rows for a single-column table of the desired size + auto const nrows = + static_cast(table_size_bytes / sizeof(random_data_t)); + auto table = random_table(1, nrows, 0, 1000, stream, table_mr); + + // Warm up + auto warm_up = cudf::pack(table.view(), stream, pack_mr); + stream.synchronize(); + + for (auto _ : state) { + auto packed = cudf::pack(table.view(), stream, pack_mr); + benchmark::DoNotOptimize(packed); + stream.synchronize(); + } + + state.SetBytesProcessed( + static_cast(state.iterations()) + * 
static_cast(table_size_bytes) + ); + state.counters["table_size_mb"] = static_cast(table_size_mb); + state.counters["num_rows"] = nrows; +} + +/** + * @brief Benchmark for cudf::pack with device memory + */ +static void BM_Pack_device(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + run_pack(state, table_size_mb, pool_mr, pool_mr, stream); +} + +/** + * @brief Benchmark for cudf::pack with device memory + */ +static void BM_Pack_pinned_host(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); +} + +/** + * @brief Runs the cudf::pack and copy the packed data to a host buffer * @param state The benchmark state * @param table_size_mb The size of the table in MB * @param table_mr The memory resource for the table @@ -99,6 +172,11 @@ static void BM_Pack_device_copy_to_host(benchmark::State& state) { * @brief Benchmark for cudf::pack with device memory */ static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); rmm::cuda_stream_view stream = rmm::cuda_stream_default; @@ -207,7 +285,7 @@ void run_chunked_pack( ) ); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - reinterpret_cast(destination.data()) + offset, + 
static_cast(destination.data()) + offset, bounce_buffer.data(), bytes_copied, cudaMemcpyDefault, @@ -217,7 +295,7 @@ void run_chunked_pack( } }; - { + { // Warm up run_packer(); stream.synchronize(); } @@ -240,9 +318,40 @@ void run_chunked_pack( } /** - * @brief Benchmark for cudf::chunked_pack with device memory + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ +static void BM_ChunkedPack_device_copy_to_device(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and pinned host + * destination buffer. */ static void BM_ChunkedPack_device_copy_to_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -263,7 +372,8 @@ static void BM_ChunkedPack_device_copy_to_pinned_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with device memory, copying to host memory. + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and host destination + * buffer. * @param state The benchmark state containing the table size in MB as the first range * argument. 
*/ @@ -288,12 +398,17 @@ static void BM_ChunkedPack_device_copy_to_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with pinned memory, copying to pinned host - * memory. + * @brief Benchmark for cudf::chunked_pack with pinned bounce buffer and pinned host + * destination buffer. memory. * @param state The benchmark state containing the table size in MB as the first range * argument. */ static void BM_ChunkedPack_pinned_copy_to_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -314,11 +429,17 @@ static void BM_ChunkedPack_pinned_copy_to_pinned_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with pinned memory, copying to host memory. + * @brief Benchmark for cudf::chunked_pack with pinned bounce buffer and host destination + * buffer. * @param state The benchmark state containing the table size in MB as the first range * argument. 
*/ static void BM_ChunkedPack_pinned_copy_to_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -356,6 +477,17 @@ void PackArguments(benchmark::internal::Benchmark* b) { // Register the benchmarks // Pack benchmarks +BENCHMARK(BM_Pack_device) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_Pack_pinned_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +// Pack and copy benchmarks BENCHMARK(BM_Pack_device_copy_to_host) + ->Apply(PackArguments) + ->UseRealTime() @@ -371,12 +503,13 @@ BENCHMARK(BM_Pack_pinned_copy_to_host) ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_Pack_pinned_copy_to_pinned_host) +// Chunked pack benchmarks + +BENCHMARK(BM_ChunkedPack_device_copy_to_device) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); -// Chunked pack benchmarks BENCHMARK(BM_ChunkedPack_device_copy_to_pinned_host) ->Apply(PackArguments) ->UseRealTime() From d4859968270ec0a62ac764fed0de7cd26f78c942 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 13 Jan 2026 14:26:23 -0800 Subject: [PATCH 04/10] removing case Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 37f78b581..28f054ebf 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -214,29 +214,6 @@ static void BM_Pack_pinned_copy_to_host(benchmark::State& state) { run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, dest_mr, stream); } -/** - * @brief Benchmark for cudf::pack with pinned memory - */ -static void BM_Pack_pinned_copy_to_pinned_host(benchmark::State& state) { - if 
(!rapidsmpf::is_pinned_memory_resources_supported()) { - state.SkipWithMessage("Pinned memory resources are not supported"); - return; - } - - auto const table_size_mb = static_cast(state.range(0)); - - rmm::cuda_stream_view stream = rmm::cuda_stream_default; - - // Create memory resources - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; - rapidsmpf::PinnedMemoryResource pinned_mr; - - run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, pinned_mr, stream); -} - /** * @brief Runs the cudf::chunked_pack benchmark * @param state The benchmark state From fe1b08030a1b9419ce8640e4e38570d9f1540cdc Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 13 Jan 2026 15:27:05 -0800 Subject: [PATCH 05/10] bypass pinned Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 28f054ebf..80f36b7da 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -87,6 +87,11 @@ static void BM_Pack_device(benchmark::State& state) { * @brief Benchmark for cudf::pack with device memory */ static void BM_Pack_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); rmm::cuda_stream_view stream = rmm::cuda_stream_default; From 44db8f74a2fa75ae918e12e225c527f92d135ce2 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Wed, 14 Jan 2026 10:44:47 -0800 Subject: [PATCH 06/10] chunked pack without bounce buffer Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 150 ++++++++++++++++++++++++++++++++-- 1 file changed, 144 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 80f36b7da..b7025ea04 
100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -252,7 +252,7 @@ void run_chunked_pack( total_size = packer.get_total_contiguous_size(); } - // Allocate bounce buffer and destination buffer using the pack_mr + // Allocate bounce buffer using the pack_mr & destination buffer using the dest_mr rmm::device_buffer bounce_buffer(bounce_buffer_size, stream, pack_mr); DestinationBufferType destination(total_size, stream, dest_mr); @@ -261,11 +261,9 @@ void run_chunked_pack( std::size_t offset = 0; while (packer.has_next()) { - auto const bytes_copied = packer.next( - cudf::device_span( - static_cast(bounce_buffer.data()), bounce_buffer_size - ) - ); + auto const bytes_copied = packer.next(cudf::device_span( + static_cast(bounce_buffer.data()), bounce_buffer_size + )); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( static_cast(destination.data()) + offset, bounce_buffer.data(), @@ -442,6 +440,136 @@ static void BM_ChunkedPack_pinned_copy_to_host(benchmark::State& state) { ); } +/** + * @brief Runs the cudf::chunked_pack benchmark + * @param state The benchmark state + * @param bounce_buffer_size The size of the bounce buffer in bytes + * @param table_size The size of the table in bytes + * @param table_mr The memory resource for the table + * @param pack_mr The memory resource for the packed data + * @param dest_mr The memory resource for the destination data + * @param stream The CUDA stream to use + * + * @tparam DestinationBufferType The type of the destination buffer + */ +template +void run_chunked_pack_without_bounce_buffer( + benchmark::State& state, + std::size_t bounce_buffer_size, + std::size_t table_size, + rmm::device_async_resource_ref table_mr, + rmm::device_async_resource_ref pack_mr, + auto& dest_mr, + rmm::cuda_stream_view stream +) { + // Calculate number of rows for a single-column table of the desired size + auto const nrows = static_cast(table_size / sizeof(random_data_t)); + auto table = random_table(1, nrows, 0, 1000, stream, 
table_mr); + + // Create the chunked_pack instance to get total output size + size_t total_size; + { + cudf::chunked_pack packer(table.view(), bounce_buffer_size, stream, table_mr); + // upper bound multiple of bounce buffer size + total_size = ((packer.get_total_contiguous_size() + bounce_buffer_size - 1) + / bounce_buffer_size) + * bounce_buffer_size; + } + + // Allocate the destination buffer + DestinationBufferType destination(total_size, stream, dest_mr); + + auto run_packer = [&] { + cudf::chunked_pack packer(table.view(), bounce_buffer_size, stream, pack_mr); + + std::size_t offset = 0; + while (packer.has_next()) { + auto const bytes_copied = packer.next(cudf::device_span( + reinterpret_cast(destination.data()) + offset, + bounce_buffer_size + )); + offset += bytes_copied; + } + }; + + { // Warm up + run_packer(); + stream.synchronize(); + } + + for (auto _ : state) { + run_packer(); + benchmark::DoNotOptimize(destination); + stream.synchronize(); + } + + state.SetBytesProcessed( + static_cast(state.iterations()) + * static_cast(table_size) + ); + state.counters["table_size_mb"] = + static_cast(table_size) / static_cast(MB); + state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = + static_cast(bounce_buffer_size) / static_cast(MB); +} + +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
+ */ +static void BM_ChunkedPack_device(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ +static void BM_ChunkedPack_pinned(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pinned_mr, stream + ); +} + /** * @brief Custom argument generator for pack benchmarks. 
* @@ -487,6 +615,16 @@ BENCHMARK(BM_Pack_pinned_copy_to_host) // Chunked pack benchmarks +BENCHMARK(BM_ChunkedPack_device) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + BENCHMARK(BM_ChunkedPack_device_copy_to_device) ->Apply(PackArguments) ->UseRealTime() From 73b38a6cf00eb06563c0453529f5d8e9578adee2 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Wed, 14 Jan 2026 11:55:25 -0800 Subject: [PATCH 07/10] another bench Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 40 +++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index b7025ea04..c994ffbe8 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -545,7 +545,7 @@ static void BM_ChunkedPack_device(benchmark::State& state) { * @param state The benchmark state containing the table size in MB as the first range * argument. */ -static void BM_ChunkedPack_pinned(benchmark::State& state) { +static void BM_ChunkedPack_pinned_device_mr(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; @@ -570,6 +570,37 @@ static void BM_ChunkedPack_pinned(benchmark::State& state) { ); } +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
+ */ +static void BM_ChunkedPack_pinned_pinned_mr(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + ); +} + /** * @brief Custom argument generator for pack benchmarks. * @@ -620,7 +651,12 @@ BENCHMARK(BM_ChunkedPack_device) ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_ChunkedPack_pinned) +BENCHMARK(BM_ChunkedPack_pinned_device_mr) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned_pinned_mr) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); From 47a07171d150dc3551de97568ef3d48e5e91c628 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Thu, 15 Jan 2026 10:31:05 -0800 Subject: [PATCH 08/10] addressing PR comments and fixed sized cases Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 384 ++++++++++++++++++++++++++-------- 1 file changed, 302 insertions(+), 82 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index c994ffbe8..2a356ba32 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -9,6 +9,8 @@ #include +#include + #include #include #include @@ -17,7 +19,6 @@ #include #include #include -#include #include #include @@ -75,12 +76,9 @@ static void 
BM_Pack_device(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; - run_pack(state, table_size_mb, pool_mr, pool_mr, stream); + run_pack(state, table_size_mb, cuda_mr, cuda_mr, stream); } /** @@ -96,13 +94,10 @@ static void BM_Pack_pinned_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; - run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); + run_pack(state, table_size_mb, cuda_mr, pinned_mr, stream); } /** @@ -132,6 +127,13 @@ void run_pack_and_copy( auto warm_up = cudf::pack(table.view(), stream, pack_mr); rapidsmpf::HostBuffer dest(warm_up.gpu_data->size(), stream, dest_mr); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + dest.data(), + warm_up.gpu_data->data(), + warm_up.gpu_data->size(), + cudaMemcpyDefault, + stream + )); stream.synchronize(); for (auto _ : state) { @@ -164,13 +166,10 @@ static void BM_Pack_device_copy_to_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::HostMemoryResource host_mr; - run_pack_and_copy(state, table_size_mb, pool_mr, pool_mr, host_mr, stream); + run_pack_and_copy(state, table_size_mb, cuda_mr, cuda_mr, host_mr, stream); } /** @@ -186,13 +185,9 @@ static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources rmm::mr::cuda_async_memory_resource cuda_mr; - 
rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; - run_pack_and_copy(state, table_size_mb, pool_mr, pool_mr, pinned_mr, stream); + run_pack_and_copy(state, table_size_mb, cuda_mr, cuda_mr, pinned_mr, stream); } /** @@ -208,15 +203,11 @@ static void BM_Pack_pinned_copy_to_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; rapidsmpf::HostMemoryResource dest_mr; - run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, dest_mr, stream); + run_pack_and_copy(state, table_size_mb, cuda_mr, pinned_mr, dest_mr, stream); } /** @@ -261,9 +252,11 @@ void run_chunked_pack( std::size_t offset = 0; while (packer.has_next()) { - auto const bytes_copied = packer.next(cudf::device_span( - static_cast(bounce_buffer.data()), bounce_buffer_size - )); + auto const bytes_copied = packer.next( + cudf::device_span( + static_cast(bounce_buffer.data()), bounce_buffer_size + ) + ); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( static_cast(destination.data()) + offset, bounce_buffer.data(), @@ -313,12 +306,9 @@ static void BM_ChunkedPack_device_copy_to_device(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, cuda_mr, stream ); } @@ -341,13 +331,10 @@ static void BM_ChunkedPack_device_copy_to_pinned_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; rmm::mr::cuda_async_memory_resource cuda_mr; - 
rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, pinned_mr, stream ); } @@ -365,15 +352,11 @@ static void BM_ChunkedPack_device_copy_to_host(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::HostMemoryResource host_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, host_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, host_mr, stream ); } @@ -396,15 +379,11 @@ static void BM_ChunkedPack_pinned_copy_to_pinned_host(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream ); } @@ -427,16 +406,12 @@ static void BM_ChunkedPack_pinned_copy_to_host(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; rapidsmpf::HostMemoryResource host_mr; run_chunked_pack( - state, 
bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, host_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, host_mr, stream ); } @@ -484,10 +459,12 @@ void run_chunked_pack_without_bounce_buffer( std::size_t offset = 0; while (packer.has_next()) { - auto const bytes_copied = packer.next(cudf::device_span( - reinterpret_cast(destination.data()) + offset, - bounce_buffer_size - )); + auto const bytes_copied = packer.next( + cudf::device_span( + reinterpret_cast(destination.data()) + offset, + bounce_buffer_size + ) + ); offset += bytes_copied; } }; @@ -528,14 +505,10 @@ static void BM_ChunkedPack_device(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; run_chunked_pack_without_bounce_buffer( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, cuda_mr, stream ); } @@ -558,15 +531,11 @@ static void BM_ChunkedPack_pinned_device_mr(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack_without_bounce_buffer( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, pinned_mr, stream ); } @@ -589,15 +558,254 @@ static void BM_ChunkedPack_pinned_pinned_mr(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; + 
rmm::mr::cuda_async_memory_resource cuda_mr; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream + ); +} + +/** + * @brief Runs the cudf::chunked_pack benchmark + * @param state The benchmark state + * @param bounce_buffer_size The size of the bounce buffer in bytes + * @param table_size The size of the table in bytes + * @param table_mr The memory resource for the table + * @param pack_mr The memory resource for the packed data + * @param dest_mr The memory resource for the destination data + * @param stream The CUDA stream to use + * + * @tparam DestinationBufferType The type of the destination buffer + */ +void run_chunked_pack_with_fixed_sized_host_buffers( + benchmark::State& state, + std::size_t fixed_buffer_size, + std::size_t table_size, + rmm::device_async_resource_ref table_mr, + rmm::device_async_resource_ref pack_mr, + rmm::host_async_resource_ref host_mr, + rmm::cuda_stream_view stream +) { + // Calculate number of rows for a single-column table of the desired size + auto const nrows = static_cast(table_size / sizeof(random_data_t)); + auto table = random_table(1, nrows, 0, 1000, stream, table_mr); + + // Create the chunked_pack instance to get total output size + size_t n_buffers; + { + cudf::chunked_pack packer(table.view(), fixed_buffer_size, stream, table_mr); + // upper bound multiple of bounce buffer size + n_buffers = (packer.get_total_contiguous_size() + fixed_buffer_size - 1) + / fixed_buffer_size; + } + + // Allocate fixed sized host buffers for the destination + std::vector fixed_host_buffers; + for (size_t i = 0; i < n_buffers; i++) { + fixed_host_buffers.emplace_back(fixed_buffer_size, stream, host_mr); + } + + auto run_packer = [&] { + cudf::chunked_pack packer(table.view(), fixed_buffer_size, stream, pack_mr); + + std::size_t buffer_idx = 0; + while (packer.has_next()) { + std::ignore = packer.next( + 
cudf::device_span( + reinterpret_cast( + fixed_host_buffers[buffer_idx].data() + ), + fixed_buffer_size + ) + ); + buffer_idx++; + } + }; + { // Warm up + run_packer(); + stream.synchronize(); + } + + for (auto _ : state) { + run_packer(); + benchmark::DoNotOptimize(fixed_host_buffers); + stream.synchronize(); + } + + state.SetBytesProcessed( + static_cast(state.iterations()) + * static_cast(table_size) + ); + state.counters["table_size_mb"] = + static_cast(table_size) / static_cast(MB); + state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = 0; +} + +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ +static void BM_ChunkedPack_pinned_to_fixed_sized_pinned(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_with_fixed_sized_host_buffers( + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream + ); +} + +void run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( + benchmark::State& state, + std::size_t fixed_buffer_size, + std::size_t table_size, + rmm::device_async_resource_ref table_mr, + rmm::device_async_resource_ref pack_mr, + rmm::host_async_resource_ref host_mr, + rmm::cuda_stream_view stream +) { + // Calculate number of rows for a 
single-column table of the desired size + auto const nrows = static_cast(table_size / sizeof(random_data_t)); + auto table = random_table(1, nrows, 0, 1000, stream, table_mr); + + // Create the chunked_pack instance to get total output size + size_t n_buffers; + { + cudf::chunked_pack packer(table.view(), fixed_buffer_size, stream, table_mr); + // upper bound multiple of bounce buffer size + n_buffers = (packer.get_total_contiguous_size() + fixed_buffer_size - 1) + / fixed_buffer_size; + } + + // Allocate fixed sized host buffers for the destination + std::vector fixed_host_buffers; + for (size_t i = 0; i < n_buffers; i++) { + fixed_host_buffers.emplace_back(fixed_buffer_size, stream, host_mr); + } + + rmm::device_buffer bounce_buffer(fixed_buffer_size, stream, pack_mr); + + auto run_packer = [&] { + cudf::chunked_pack packer(table.view(), fixed_buffer_size, stream, pack_mr); + + std::size_t buffer_idx = 0; + while (packer.has_next()) { + auto const bytes_copied = packer.next( + cudf::device_span( + static_cast(bounce_buffer.data()), fixed_buffer_size + ) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + static_cast(fixed_host_buffers[buffer_idx].data()), + bounce_buffer.data(), + bytes_copied, + cudaMemcpyDefault, + stream.value() + )); + buffer_idx++; + } }; + + { // Warm up + run_packer(); + stream.synchronize(); + } + + for (auto _ : state) { + run_packer(); + benchmark::DoNotOptimize(bounce_buffer); + benchmark::DoNotOptimize(fixed_host_buffers); + stream.synchronize(); + } + + state.SetBytesProcessed( + static_cast(state.iterations()) + * static_cast(table_size) + ); + state.counters["table_size_mb"] = + static_cast(table_size) / static_cast(MB); + state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = + static_cast(bounce_buffer.size()) / static_cast(MB); +} + +static void BM_ChunkedPack_device_to_fixed_sized_pinned(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory 
resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; rapidsmpf::PinnedMemoryResource pinned_mr; - run_chunked_pack_without_bounce_buffer( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, pinned_mr, stream + ); +} + +static void BM_ChunkedPack_device_to_fixed_sized_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; + rapidsmpf::HostMemoryResource host_mr; + + run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, host_mr, stream + ); +} + +static void BM_ChunkedPack_pinned_to_fixed_sized_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = 
rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; + rapidsmpf::PinnedMemoryResource pinned_mr; + rapidsmpf::HostMemoryResource host_mr; + + run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, host_mr, stream ); } @@ -686,6 +894,28 @@ BENCHMARK(BM_ChunkedPack_pinned_copy_to_host) ->UseRealTime() ->Unit(benchmark::kMillisecond); +// Chunked pack with fixed sized host buffers and bounce buffer benchmarks + +BENCHMARK(BM_ChunkedPack_device_to_fixed_sized_pinned) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_device_to_fixed_sized_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned_to_fixed_sized_pinned) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned_to_fixed_sized_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + /** * @brief Benchmark for cudf::chunked_pack in device memory varying the bounce buffer size * and keeping table size fixed at 1GB @@ -695,16 +925,10 @@ static void BM_ChunkedPack_fixed_table_device(benchmark::State& state) { constexpr std::size_t table_size_bytes = 1024 * MB; rmm::cuda_stream_view stream = rmm::cuda_stream_default; - - // Create memory resources rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; - run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, cuda_mr, stream ); } @@ -722,15 +946,11 @@ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { constexpr std::size_t table_size_bytes = 1024 * MB; rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - 
rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream ); } From 7126e01847851e51acbd46b9494e9e694579ae55 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 20 Jan 2026 12:47:36 -0800 Subject: [PATCH 09/10] extending benchmarks Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 93 ++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 2a356ba32..2b7c4a9e3 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -66,6 +66,7 @@ void run_pack( ); state.counters["table_size_mb"] = static_cast(table_size_mb); state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = 0; } /** @@ -82,7 +83,7 @@ static void BM_Pack_device(benchmark::State& state) { } /** - * @brief Benchmark for cudf::pack with device memory + * @brief Benchmark for cudf::pack with pinned host memory as the pack destination. */ static void BM_Pack_pinned_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { @@ -156,10 +157,11 @@ void run_pack_and_copy( ); state.counters["table_size_mb"] = static_cast(table_size_mb); state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = 0; } /** - * @brief Benchmark for cudf::pack with device memory + * @brief Benchmark for cudf::pack with device memory and copy to host memory. 
*/ static void BM_Pack_device_copy_to_host(benchmark::State& state) { auto const table_size_mb = static_cast(state.range(0)); @@ -173,7 +175,7 @@ static void BM_Pack_device_copy_to_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::pack with device memory + * @brief Benchmark for cudf::pack with device memory and copy to pinned host memory. */ static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { @@ -191,7 +193,7 @@ static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::pack with pinned memory + * @brief Benchmark for cudf::pack with pinned memory and copy to host memory. */ static void BM_Pack_pinned_copy_to_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { @@ -362,7 +364,7 @@ static void BM_ChunkedPack_device_copy_to_host(benchmark::State& state) { /** * @brief Benchmark for cudf::chunked_pack with pinned bounce buffer and pinned host - * destination buffer. memory. + * destination buffer. * @param state The benchmark state containing the table size in MB as the first range * argument. */ @@ -492,8 +494,7 @@ void run_chunked_pack_without_bounce_buffer( } /** - * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination - * buffer. + * @brief Benchmark for cudf::chunked_pack directly into device memory (no bounce buffer). * @param state The benchmark state containing the table size in MB as the first range * argument. */ @@ -513,8 +514,8 @@ static void BM_ChunkedPack_device(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination - * buffer. + * @brief Benchmark for cudf::chunked_pack directly into pinned host memory with device + * pack memory resource (no bounce buffer). * @param state The benchmark state containing the table size in MB as the first range * argument. 
*/ @@ -540,8 +541,8 @@ static void BM_ChunkedPack_pinned_device_mr(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination - * buffer. + * @brief Benchmark for cudf::chunked_pack directly into pinned host memory with pinned + * pack memory resource (no bounce buffer). * @param state The benchmark state containing the table size in MB as the first range * argument. */ @@ -645,8 +646,8 @@ void run_chunked_pack_with_fixed_sized_host_buffers( } /** - * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination - * buffer. + * @brief Benchmark for cudf::chunked_pack directly into fixed-sized pinned host buffers + * with pinned pack memory resource (no bounce buffer). * @param state The benchmark state containing the table size in MB as the first range * argument. */ @@ -745,6 +746,12 @@ void run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( static_cast(bounce_buffer.size()) / static_cast(MB); } +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer copying to + * fixed-sized pinned host buffers. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ static void BM_ChunkedPack_device_to_fixed_sized_pinned(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); @@ -766,6 +773,12 @@ static void BM_ChunkedPack_device_to_fixed_sized_pinned(benchmark::State& state) ); } +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer copying to + * fixed-sized host buffers. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
+ */ static void BM_ChunkedPack_device_to_fixed_sized_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); @@ -787,6 +800,12 @@ static void BM_ChunkedPack_device_to_fixed_sized_host(benchmark::State& state) { ); } +/** + * @brief Benchmark for cudf::chunked_pack with pinned bounce buffer copying to + * fixed-sized host buffers. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ static void BM_ChunkedPack_pinned_to_fixed_sized_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); @@ -954,6 +973,44 @@ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { ); } +/** + * @brief Benchmark for cudf::chunked_pack in device memory varying the bounce buffer size + * and keeping table size fixed at 1GB + */ +static void BM_ChunkedPack_fixed_table_device_no_bounce_buffer(benchmark::State& state) { + auto const bounce_buffer_size = static_cast(state.range(0)) * MB; + constexpr std::size_t table_size_bytes = 1024 * MB; + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, cuda_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack in pinned memory varying the bounce buffer size + * and keeping table size fixed at 1GB + */ +static void BM_ChunkedPack_fixed_table_pinned_no_bounce_buffer(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const bounce_buffer_size = static_cast(state.range(0)) * MB; + constexpr std::size_t table_size_bytes = 1024 * MB; + + rmm::cuda_stream_view stream = 
rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream + ); +} + /** * @brief Custom argument generator for chunked pack benchmarks with fixed table size. * @@ -978,4 +1035,14 @@ BENCHMARK(BM_ChunkedPack_fixed_table_pinned) ->UseRealTime() ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_ChunkedPack_fixed_table_device_no_bounce_buffer) + ->Apply(ChunkedPackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_fixed_table_pinned_no_bounce_buffer) + ->Apply(ChunkedPackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + BENCHMARK_MAIN(); From 2fc4c36dd0bd84d7b9ee48297aed0a56b84a3350 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 20 Jan 2026 12:54:29 -0800 Subject: [PATCH 10/10] precommit Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index a3817c281..3bd66ecc8 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */