From 3b842c10ce17c2b78b3b6218278ea70e2cde49a5 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Mon, 12 Jan 2026 13:53:29 -0800 Subject: [PATCH 01/10] reenable chunked pack benchmarks Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index dde7f2f48..40db6d52b 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. * SPDX-License-Identifier: Apache-2.0 */ @@ -85,8 +85,7 @@ static void BM_Pack_device(benchmark::State& state) { * @brief Benchmark for cudf::pack with pinned memory */ static void BM_Pack_pinned(benchmark::State& state) { - state.SkipWithMessage("Skipping until cudf#20886 is fixed"); - /* if (!rapidsmpf::is_pinned_memory_resources_supported()) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; } @@ -102,7 +101,7 @@ static void BM_Pack_pinned(benchmark::State& state) { }; rapidsmpf::PinnedMemoryResource pinned_mr; - run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); */ + run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); } /** @@ -206,8 +205,7 @@ static void BM_ChunkedPack_device(benchmark::State& state) { * @brief Benchmark for cudf::chunked_pack pinned memory */ static void BM_ChunkedPack_pinned(benchmark::State& state) { - state.SkipWithMessage("Skipping until cudf#20886 is fixed"); - /* if (!rapidsmpf::is_pinned_memory_resources_supported()) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; } @@ -228,7 +226,7 @@ static void BM_ChunkedPack_pinned(benchmark::State& state) { run_chunked_pack( 
state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, stream - ); */ + ); } // Custom argument generator for the benchmark @@ -286,8 +284,7 @@ static void BM_ChunkedPack_fixed_table_device(benchmark::State& state) { * and keeping table size fixed at 1GB */ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { - state.SkipWithMessage("Skipping until cudf#20886 is fixed"); - /* if (!rapidsmpf::is_pinned_memory_resources_supported()) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; } @@ -305,7 +302,7 @@ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { run_chunked_pack( state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, stream - ); */ + ); } // Custom argument generator for the benchmark From aed3a27bd54990a03ad683792507352ecee9a206 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Mon, 12 Jan 2026 16:38:23 -0800 Subject: [PATCH 02/10] extending bench Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 200 +++++++++++++++++++++++++++++----- 1 file changed, 173 insertions(+), 27 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 40db6d52b..4c136fc1c 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include "utils/random_data.hpp" @@ -33,11 +35,12 @@ constexpr std::size_t MB = 1024 * 1024; * @param pack_mr The memory resource for the packed data * @param stream The CUDA stream to use */ -void run_pack( +void run_pack_and_copy( benchmark::State& state, std::size_t table_size_mb, rmm::device_async_resource_ref table_mr, rmm::device_async_resource_ref pack_mr, + rmm::host_async_resource_ref dest_mr, rmm::cuda_stream_view stream ) { auto const table_size_bytes = table_size_mb * MB; @@ -49,11 +52,21 @@ void run_pack( // Warm up auto warm_up = cudf::pack(table.view(), 
stream, pack_mr); + + rapidsmpf::HostBuffer dest(warm_up.gpu_data->size(), stream, dest_mr); stream.synchronize(); for (auto _ : state) { auto packed = cudf::pack(table.view(), stream, pack_mr); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + dest.data(), + packed.gpu_data->data(), + packed.gpu_data->size(), + cudaMemcpyDefault, + stream + )); benchmark::DoNotOptimize(packed); + benchmark::DoNotOptimize(dest); stream.synchronize(); } @@ -68,7 +81,7 @@ void run_pack( /** * @brief Benchmark for cudf::pack with device memory */ -static void BM_Pack_device(benchmark::State& state) { +static void BM_Pack_device_copy_to_host(benchmark::State& state) { auto const table_size_mb = static_cast(state.range(0)); rmm::cuda_stream_view stream = rmm::cuda_stream_default; @@ -78,13 +91,31 @@ static void BM_Pack_device(benchmark::State& state) { rmm::mr::pool_memory_resource pool_mr{ cuda_mr, rmm::percent_of_free_device_memory(40) }; - run_pack(state, table_size_mb, pool_mr, pool_mr, stream); + rapidsmpf::HostMemoryResource host_mr; + run_pack_and_copy(state, table_size_mb, pool_mr, pool_mr, host_mr, stream); +} + +/** + * @brief Benchmark for cudf::pack with device memory + */ +static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + run_pack_and_copy(state, table_size_mb, pool_mr, pool_mr, pinned_mr, stream); } /** * @brief Benchmark for cudf::pack with pinned memory */ -static void BM_Pack_pinned(benchmark::State& state) { +static void BM_Pack_pinned_copy_to_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; @@ -100,8 
+131,32 @@ static void BM_Pack_pinned(benchmark::State& state) { cuda_mr, rmm::percent_of_free_device_memory(40) }; rapidsmpf::PinnedMemoryResource pinned_mr; + rapidsmpf::HostMemoryResource dest_mr; - run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); + run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, dest_mr, stream); +} + +/** + * @brief Benchmark for cudf::pack with pinned memory + */ +static void BM_Pack_pinned_copy_to_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, pinned_mr, stream); } /** @@ -111,14 +166,19 @@ static void BM_Pack_pinned(benchmark::State& state) { * @param table_size The size of the table in bytes * @param table_mr The memory resource for the table * @param pack_mr The memory resource for the packed data + * @param dest_mr The memory resource for the destination data * @param stream The CUDA stream to use + * + * @tparam DestinationBufferType The type of the destination buffer */ +template void run_chunked_pack( benchmark::State& state, std::size_t bounce_buffer_size, std::size_t table_size, rmm::device_async_resource_ref table_mr, rmm::device_async_resource_ref pack_mr, + auto& dest_mr, rmm::cuda_stream_view stream ) { // Calculate number of rows for a single-column table of the desired size @@ -134,7 +194,7 @@ void run_chunked_pack( // Allocate bounce buffer and destination buffer using the pack_mr rmm::device_buffer bounce_buffer(bounce_buffer_size, stream, pack_mr); - rmm::device_buffer 
destination(total_size, stream, pack_mr); + DestinationBufferType destination(total_size, stream, dest_mr); auto run_packer = [&] { cudf::chunked_pack packer(table.view(), bounce_buffer_size, stream, pack_mr); @@ -147,7 +207,7 @@ void run_chunked_pack( ) ); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - static_cast(destination.data()) + offset, + reinterpret_cast(destination.data()) + offset, bounce_buffer.data(), bytes_copied, cudaMemcpyDefault, @@ -182,7 +242,7 @@ void run_chunked_pack( /** * @brief Benchmark for cudf::chunked_pack with device memory */ -static void BM_ChunkedPack_device(benchmark::State& state) { +static void BM_ChunkedPack_device_copy_to_pinned_host(benchmark::State& state) { auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -195,21 +255,70 @@ static void BM_ChunkedPack_device(benchmark::State& state) { rmm::mr::pool_memory_resource pool_mr{ cuda_mr, rmm::percent_of_free_device_memory(40) }; + rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, stream + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pinned_mr, stream ); } /** - * @brief Benchmark for cudf::chunked_pack pinned memory + * @brief Benchmark for cudf::chunked_pack with device memory, copying to host memory. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
*/ -static void BM_ChunkedPack_pinned(benchmark::State& state) { - if (!rapidsmpf::is_pinned_memory_resources_supported()) { - state.SkipWithMessage("Pinned memory resources are not supported"); - return; - } +static void BM_ChunkedPack_device_copy_to_host(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::HostMemoryResource host_mr; + + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, host_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack with pinned memory, copying to pinned host + * memory. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ +static void BM_ChunkedPack_pinned_copy_to_pinned_host(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack with pinned memory, copying to host memory. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
+ */ +static void BM_ChunkedPack_pinned_copy_to_host(benchmark::State& state) { auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -223,13 +332,20 @@ static void BM_ChunkedPack_pinned(benchmark::State& state) { cuda_mr, rmm::percent_of_free_device_memory(40) }; rapidsmpf::PinnedMemoryResource pinned_mr; + rapidsmpf::HostMemoryResource host_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, host_mr, stream ); } -// Custom argument generator for the benchmark +/** + * @brief Custom argument generator for pack benchmarks. + * + * Configures benchmarks to run with various table sizes ranging from 1MB to 4GB. + * + * @param b The benchmark to configure with arguments. + */ void PackArguments(benchmark::internal::Benchmark* b) { // Test different table sizes in MB (minimum 1MB as requested) for (auto size_mb : {1, 10, 100, 500, 1000, 2000, 4000}) { @@ -238,21 +354,45 @@ void PackArguments(benchmark::internal::Benchmark* b) { } // Register the benchmarks -BENCHMARK(BM_Pack_device) + +// Pack benchmarks +BENCHMARK(BM_Pack_device_copy_to_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_Pack_device_copy_to_pinned_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_Pack_pinned_copy_to_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_Pack_pinned_copy_to_pinned_host) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_Pack_pinned) + +// Chunked pack benchmarks +BENCHMARK(BM_ChunkedPack_device_copy_to_pinned_host) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_ChunkedPack_device) +BENCHMARK(BM_ChunkedPack_device_copy_to_host) ->Apply(PackArguments) ->UseRealTime() 
->Unit(benchmark::kMillisecond); -BENCHMARK(BM_ChunkedPack_pinned) +BENCHMARK(BM_ChunkedPack_pinned_copy_to_pinned_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned_copy_to_host) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); @@ -274,8 +414,8 @@ static void BM_ChunkedPack_fixed_table_device(benchmark::State& state) { cuda_mr, rmm::percent_of_free_device_memory(40) }; - run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, stream + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream ); } @@ -300,14 +440,20 @@ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { }; rapidsmpf::PinnedMemoryResource pinned_mr; - run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, stream + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream ); } -// Custom argument generator for the benchmark +/** + * @brief Custom argument generator for chunked pack benchmarks with fixed table size. + * + * Configures benchmarks to run with various bounce buffer sizes ranging from 1MB to 1GB. + * + * @param b The benchmark to configure with arguments. 
+ */ void ChunkedPackArguments(benchmark::internal::Benchmark* b) { - // Test different table sizes in MB (minimum 1MB as requested) + // Test different bounce buffer sizes in MB for (auto bounce_buf_sz_mb : {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}) { b->Args({bounce_buf_sz_mb}); } From dea025b9c3b412cee386f9d889cc0bb45eaa4bdd Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 13 Jan 2026 14:20:48 -0800 Subject: [PATCH 03/10] more cases Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 153 +++++++++++++++++++++++++++++++--- 1 file changed, 143 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 4c136fc1c..37f78b581 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -28,7 +28,80 @@ constexpr std::size_t MB = 1024 * 1024; /** - * @brief Runs the cudf::pack benchmark + * @brief Runs the cudf::pack using a device-accessible memory resource + * @param state The benchmark state + * @param table_size_mb The size of the table in MB + * @param table_mr The memory resource for the table + * @param pack_mr The memory resource for the packed data + * @param stream The CUDA stream to use + */ +void run_pack( + benchmark::State& state, + std::size_t table_size_mb, + rmm::device_async_resource_ref table_mr, + rmm::device_async_resource_ref pack_mr, + rmm::cuda_stream_view stream +) { + auto const table_size_bytes = table_size_mb * MB; + + // Calculate number of rows for a single-column table of the desired size + auto const nrows = + static_cast(table_size_bytes / sizeof(random_data_t)); + auto table = random_table(1, nrows, 0, 1000, stream, table_mr); + + // Warm up + auto warm_up = cudf::pack(table.view(), stream, pack_mr); + stream.synchronize(); + + for (auto _ : state) { + auto packed = cudf::pack(table.view(), stream, pack_mr); + benchmark::DoNotOptimize(packed); + stream.synchronize(); + } + + state.SetBytesProcessed( + static_cast(state.iterations()) + * 
static_cast(table_size_bytes) + ); + state.counters["table_size_mb"] = static_cast(table_size_mb); + state.counters["num_rows"] = nrows; +} + +/** + * @brief Benchmark for cudf::pack with device memory + */ +static void BM_Pack_device(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + run_pack(state, table_size_mb, pool_mr, pool_mr, stream); +} + +/** + * @brief Benchmark for cudf::pack with device memory + */ +static void BM_Pack_pinned_host(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); +} + +/** + * @brief Runs the cudf::pack and copy the packed data to a host buffer * @param state The benchmark state * @param table_size_mb The size of the table in MB * @param table_mr The memory resource for the table @@ -99,6 +172,11 @@ static void BM_Pack_device_copy_to_host(benchmark::State& state) { * @brief Benchmark for cudf::pack with device memory */ static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); rmm::cuda_stream_view stream = rmm::cuda_stream_default; @@ -207,7 +285,7 @@ void run_chunked_pack( ) ); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( - reinterpret_cast(destination.data()) + offset, + 
static_cast(destination.data()) + offset, bounce_buffer.data(), bytes_copied, cudaMemcpyDefault, @@ -217,7 +295,7 @@ void run_chunked_pack( } }; - { + { // Warm up run_packer(); stream.synchronize(); } @@ -240,9 +318,40 @@ void run_chunked_pack( } /** - * @brief Benchmark for cudf::chunked_pack with device memory + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ +static void BM_ChunkedPack_device_copy_to_device(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + + run_chunked_pack( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and pinned host + * destination buffer. */ static void BM_ChunkedPack_device_copy_to_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -263,7 +372,8 @@ static void BM_ChunkedPack_device_copy_to_pinned_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with device memory, copying to host memory. + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and host destination + * buffer. * @param state The benchmark state containing the table size in MB as the first range * argument. 
*/ @@ -288,12 +398,17 @@ static void BM_ChunkedPack_device_copy_to_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with pinned memory, copying to pinned host - * memory. + * @brief Benchmark for cudf::chunked_pack with pinned bounce buffer and pinned host + * destination buffer. memory. * @param state The benchmark state containing the table size in MB as the first range * argument. */ static void BM_ChunkedPack_pinned_copy_to_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -314,11 +429,17 @@ static void BM_ChunkedPack_pinned_copy_to_pinned_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with pinned memory, copying to host memory. + * @brief Benchmark for cudf::chunked_pack with pinned bounce buffer and host destination + * buffer. * @param state The benchmark state containing the table size in MB as the first range * argument. 
*/ static void BM_ChunkedPack_pinned_copy_to_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); auto const table_size_bytes = table_size_mb * MB; @@ -356,6 +477,17 @@ void PackArguments(benchmark::internal::Benchmark* b) { // Register the benchmarks // Pack benchmarks +BENCHMARK(BM_Pack_device) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_Pack_pinned_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +// Pack and copy benchmarks BENCHMARK(BM_Pack_device_copy_to_host) + ->Apply(PackArguments) + ->UseRealTime() @@ -371,12 +503,13 @@ BENCHMARK(BM_Pack_pinned_copy_to_host) ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_Pack_pinned_copy_to_pinned_host) +// Chunked pack benchmarks + +BENCHMARK(BM_ChunkedPack_device_copy_to_device) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); -// Chunked pack benchmarks BENCHMARK(BM_ChunkedPack_device_copy_to_pinned_host) ->Apply(PackArguments) ->UseRealTime() From d4859968270ec0a62ac764fed0de7cd26f78c942 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 13 Jan 2026 14:26:23 -0800 Subject: [PATCH 04/10] removing case Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 37f78b581..28f054ebf 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -214,29 +214,6 @@ static void BM_Pack_pinned_copy_to_host(benchmark::State& state) { run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, dest_mr, stream); } -/** - * @brief Benchmark for cudf::pack with pinned memory - */ -static void BM_Pack_pinned_copy_to_pinned_host(benchmark::State& state) { - if 
(!rapidsmpf::is_pinned_memory_resources_supported()) { - state.SkipWithMessage("Pinned memory resources are not supported"); - return; - } - - auto const table_size_mb = static_cast(state.range(0)); - - rmm::cuda_stream_view stream = rmm::cuda_stream_default; - - // Create memory resources - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; - rapidsmpf::PinnedMemoryResource pinned_mr; - - run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, pinned_mr, stream); -} - /** * @brief Runs the cudf::chunked_pack benchmark * @param state The benchmark state From fe1b08030a1b9419ce8640e4e38570d9f1540cdc Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 13 Jan 2026 15:27:05 -0800 Subject: [PATCH 05/10] bypass pinned Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 28f054ebf..80f36b7da 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -87,6 +87,11 @@ static void BM_Pack_device(benchmark::State& state) { * @brief Benchmark for cudf::pack with device memory */ static void BM_Pack_pinned_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + auto const table_size_mb = static_cast(state.range(0)); rmm::cuda_stream_view stream = rmm::cuda_stream_default; From 44db8f74a2fa75ae918e12e225c527f92d135ce2 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Wed, 14 Jan 2026 10:44:47 -0800 Subject: [PATCH 06/10] chunked pack without bounce buffer Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 150 ++++++++++++++++++++++++++++++++-- 1 file changed, 144 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 80f36b7da..b7025ea04 
100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -252,7 +252,7 @@ void run_chunked_pack( total_size = packer.get_total_contiguous_size(); } - // Allocate bounce buffer and destination buffer using the pack_mr + // Allocate bounce buffer using the pack_mr & destination buffer using the dest_mr rmm::device_buffer bounce_buffer(bounce_buffer_size, stream, pack_mr); DestinationBufferType destination(total_size, stream, dest_mr); @@ -261,11 +261,9 @@ void run_chunked_pack( std::size_t offset = 0; while (packer.has_next()) { - auto const bytes_copied = packer.next( - cudf::device_span( - static_cast(bounce_buffer.data()), bounce_buffer_size - ) - ); + auto const bytes_copied = packer.next(cudf::device_span( + static_cast(bounce_buffer.data()), bounce_buffer_size + )); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( static_cast(destination.data()) + offset, bounce_buffer.data(), @@ -442,6 +440,136 @@ static void BM_ChunkedPack_pinned_copy_to_host(benchmark::State& state) { ); } +/** + * @brief Runs the cudf::chunked_pack benchmark + * @param state The benchmark state + * @param bounce_buffer_size The size of the bounce buffer in bytes + * @param table_size The size of the table in bytes + * @param table_mr The memory resource for the table + * @param pack_mr The memory resource for the packed data + * @param dest_mr The memory resource for the destination data + * @param stream The CUDA stream to use + * + * @tparam DestinationBufferType The type of the destination buffer + */ +template +void run_chunked_pack_without_bounce_buffer( + benchmark::State& state, + std::size_t bounce_buffer_size, + std::size_t table_size, + rmm::device_async_resource_ref table_mr, + rmm::device_async_resource_ref pack_mr, + auto& dest_mr, + rmm::cuda_stream_view stream +) { + // Calculate number of rows for a single-column table of the desired size + auto const nrows = static_cast(table_size / sizeof(random_data_t)); + auto table = random_table(1, nrows, 0, 1000, stream, 
table_mr); + + // Create the chunked_pack instance to get total output size + size_t total_size; + { + cudf::chunked_pack packer(table.view(), bounce_buffer_size, stream, table_mr); + // upper bound multiple of bounce buffer size + total_size = ((packer.get_total_contiguous_size() + bounce_buffer_size - 1) + / bounce_buffer_size) + * bounce_buffer_size; + } + + // Allocate the destination buffer + DestinationBufferType destination(total_size, stream, dest_mr); + + auto run_packer = [&] { + cudf::chunked_pack packer(table.view(), bounce_buffer_size, stream, pack_mr); + + std::size_t offset = 0; + while (packer.has_next()) { + auto const bytes_copied = packer.next(cudf::device_span( + reinterpret_cast(destination.data()) + offset, + bounce_buffer_size + )); + offset += bytes_copied; + } + }; + + { // Warm up + run_packer(); + stream.synchronize(); + } + + for (auto _ : state) { + run_packer(); + benchmark::DoNotOptimize(destination); + stream.synchronize(); + } + + state.SetBytesProcessed( + static_cast(state.iterations()) + * static_cast(table_size) + ); + state.counters["table_size_mb"] = + static_cast(table_size) / static_cast(MB); + state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = + static_cast(bounce_buffer_size) / static_cast(MB); +} + +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
+ */ +static void BM_ChunkedPack_device(benchmark::State& state) { + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ +static void BM_ChunkedPack_pinned(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pinned_mr, stream + ); +} + /** * @brief Custom argument generator for pack benchmarks. 
* @@ -487,6 +615,16 @@ BENCHMARK(BM_Pack_pinned_copy_to_host) // Chunked pack benchmarks +BENCHMARK(BM_ChunkedPack_device) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + BENCHMARK(BM_ChunkedPack_device_copy_to_device) ->Apply(PackArguments) ->UseRealTime() From 73b38a6cf00eb06563c0453529f5d8e9578adee2 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Wed, 14 Jan 2026 11:55:25 -0800 Subject: [PATCH 07/10] another bench Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 40 +++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index b7025ea04..c994ffbe8 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -545,7 +545,7 @@ static void BM_ChunkedPack_device(benchmark::State& state) { * @param state The benchmark state containing the table size in MB as the first range * argument. */ -static void BM_ChunkedPack_pinned(benchmark::State& state) { +static void BM_ChunkedPack_pinned_device_mr(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); return; @@ -570,6 +570,37 @@ static void BM_ChunkedPack_pinned(benchmark::State& state) { ); } +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
+ */ +static void BM_ChunkedPack_pinned_pinned_mr(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + cuda_mr, rmm::percent_of_free_device_memory(40) + }; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + ); +} + /** * @brief Custom argument generator for pack benchmarks. * @@ -620,7 +651,12 @@ BENCHMARK(BM_ChunkedPack_device) ->UseRealTime() ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_ChunkedPack_pinned) +BENCHMARK(BM_ChunkedPack_pinned_device_mr) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned_pinned_mr) ->Apply(PackArguments) ->UseRealTime() ->Unit(benchmark::kMillisecond); From 47a07171d150dc3551de97568ef3d48e5e91c628 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Thu, 15 Jan 2026 10:31:05 -0800 Subject: [PATCH 08/10] addressing PR comments and fixed sized cases Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 384 ++++++++++++++++++++++++++-------- 1 file changed, 302 insertions(+), 82 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index c994ffbe8..2a356ba32 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -9,6 +9,8 @@ #include +#include + #include #include #include @@ -17,7 +19,6 @@ #include #include #include -#include #include #include @@ -75,12 +76,9 @@ static void 
BM_Pack_device(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; - run_pack(state, table_size_mb, pool_mr, pool_mr, stream); + run_pack(state, table_size_mb, cuda_mr, cuda_mr, stream); } /** @@ -96,13 +94,10 @@ static void BM_Pack_pinned_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; - run_pack(state, table_size_mb, pool_mr, pinned_mr, stream); + run_pack(state, table_size_mb, cuda_mr, pinned_mr, stream); } /** @@ -132,6 +127,13 @@ void run_pack_and_copy( auto warm_up = cudf::pack(table.view(), stream, pack_mr); rapidsmpf::HostBuffer dest(warm_up.gpu_data->size(), stream, dest_mr); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + dest.data(), + warm_up.gpu_data->data(), + warm_up.gpu_data->size(), + cudaMemcpyDefault, + stream + )); stream.synchronize(); for (auto _ : state) { @@ -164,13 +166,10 @@ static void BM_Pack_device_copy_to_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources + rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::HostMemoryResource host_mr; - run_pack_and_copy(state, table_size_mb, pool_mr, pool_mr, host_mr, stream); + run_pack_and_copy(state, table_size_mb, cuda_mr, cuda_mr, host_mr, stream); } /** @@ -186,13 +185,9 @@ static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources rmm::mr::cuda_async_memory_resource cuda_mr; - 
rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; - run_pack_and_copy(state, table_size_mb, pool_mr, pool_mr, pinned_mr, stream); + run_pack_and_copy(state, table_size_mb, cuda_mr, cuda_mr, pinned_mr, stream); } /** @@ -208,15 +203,11 @@ static void BM_Pack_pinned_copy_to_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; - // Create memory resources rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; rapidsmpf::HostMemoryResource dest_mr; - run_pack_and_copy(state, table_size_mb, pool_mr, pinned_mr, dest_mr, stream); + run_pack_and_copy(state, table_size_mb, cuda_mr, pinned_mr, dest_mr, stream); } /** @@ -261,9 +252,11 @@ void run_chunked_pack( std::size_t offset = 0; while (packer.has_next()) { - auto const bytes_copied = packer.next(cudf::device_span( - static_cast(bounce_buffer.data()), bounce_buffer_size - )); + auto const bytes_copied = packer.next( + cudf::device_span( + static_cast(bounce_buffer.data()), bounce_buffer_size + ) + ); RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( static_cast(destination.data()) + offset, bounce_buffer.data(), @@ -313,12 +306,9 @@ static void BM_ChunkedPack_device_copy_to_device(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, cuda_mr, stream ); } @@ -341,13 +331,10 @@ static void BM_ChunkedPack_device_copy_to_pinned_host(benchmark::State& state) { rmm::cuda_stream_view stream = rmm::cuda_stream_default; rmm::mr::cuda_async_memory_resource cuda_mr; - 
rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, pinned_mr, stream ); } @@ -365,15 +352,11 @@ static void BM_ChunkedPack_device_copy_to_host(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::HostMemoryResource host_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, host_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, host_mr, stream ); } @@ -396,15 +379,11 @@ static void BM_ChunkedPack_pinned_copy_to_pinned_host(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream ); } @@ -427,16 +406,12 @@ static void BM_ChunkedPack_pinned_copy_to_host(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; rapidsmpf::HostMemoryResource host_mr; run_chunked_pack( - state, 
bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, host_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, host_mr, stream ); } @@ -484,10 +459,12 @@ void run_chunked_pack_without_bounce_buffer( std::size_t offset = 0; while (packer.has_next()) { - auto const bytes_copied = packer.next(cudf::device_span( - reinterpret_cast(destination.data()) + offset, - bounce_buffer_size - )); + auto const bytes_copied = packer.next( + cudf::device_span( + reinterpret_cast(destination.data()) + offset, + bounce_buffer_size + ) + ); offset += bytes_copied; } }; @@ -528,14 +505,10 @@ static void BM_ChunkedPack_device(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; run_chunked_pack_without_bounce_buffer( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, cuda_mr, stream ); } @@ -558,15 +531,11 @@ static void BM_ChunkedPack_pinned_device_mr(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack_without_bounce_buffer( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, pinned_mr, stream ); } @@ -589,15 +558,254 @@ static void BM_ChunkedPack_pinned_pinned_mr(benchmark::State& state) { auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); rmm::cuda_stream_view stream = rmm::cuda_stream_default; + 
rmm::mr::cuda_async_memory_resource cuda_mr; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream + ); +} + +/** + * @brief Runs the cudf::chunked_pack benchmark + * @param state The benchmark state + * @param bounce_buffer_size The size of the bounce buffer in bytes + * @param table_size The size of the table in bytes + * @param table_mr The memory resource for the table + * @param pack_mr The memory resource for the packed data + * @param dest_mr The memory resource for the destination data + * @param stream The CUDA stream to use + * + * @tparam DestinationBufferType The type of the destination buffer + */ +void run_chunked_pack_with_fixed_sized_host_buffers( + benchmark::State& state, + std::size_t fixed_buffer_size, + std::size_t table_size, + rmm::device_async_resource_ref table_mr, + rmm::device_async_resource_ref pack_mr, + rmm::host_async_resource_ref host_mr, + rmm::cuda_stream_view stream +) { + // Calculate number of rows for a single-column table of the desired size + auto const nrows = static_cast(table_size / sizeof(random_data_t)); + auto table = random_table(1, nrows, 0, 1000, stream, table_mr); + + // Create the chunked_pack instance to get total output size + size_t n_buffers; + { + cudf::chunked_pack packer(table.view(), fixed_buffer_size, stream, table_mr); + // upper bound multiple of bounce buffer size + n_buffers = (packer.get_total_contiguous_size() + fixed_buffer_size - 1) + / fixed_buffer_size; + } + + // Allocate fixed sized host buffers for the destination + std::vector fixed_host_buffers; + for (size_t i = 0; i < n_buffers; i++) { + fixed_host_buffers.emplace_back(fixed_buffer_size, stream, host_mr); + } + + auto run_packer = [&] { + cudf::chunked_pack packer(table.view(), fixed_buffer_size, stream, pack_mr); + + std::size_t buffer_idx = 0; + while (packer.has_next()) { + std::ignore = packer.next( + 
cudf::device_span( + reinterpret_cast( + fixed_host_buffers[buffer_idx].data() + ), + fixed_buffer_size + ) + ); + buffer_idx++; + } + }; + { // Warm up + run_packer(); + stream.synchronize(); + } + + for (auto _ : state) { + run_packer(); + benchmark::DoNotOptimize(fixed_host_buffers); + stream.synchronize(); + } + + state.SetBytesProcessed( + static_cast(state.iterations()) + * static_cast(table_size) + ); + state.counters["table_size_mb"] = + static_cast(table_size) / static_cast(MB); + state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = 0; +} + +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination + * buffer. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ +static void BM_ChunkedPack_pinned_to_fixed_sized_pinned(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_with_fixed_sized_host_buffers( + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream + ); +} + +void run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( + benchmark::State& state, + std::size_t fixed_buffer_size, + std::size_t table_size, + rmm::device_async_resource_ref table_mr, + rmm::device_async_resource_ref pack_mr, + rmm::host_async_resource_ref host_mr, + rmm::cuda_stream_view stream +) { + // Calculate number of rows for a 
single-column table of the desired size + auto const nrows = static_cast(table_size / sizeof(random_data_t)); + auto table = random_table(1, nrows, 0, 1000, stream, table_mr); + + // Create the chunked_pack instance to get total output size + size_t n_buffers; + { + cudf::chunked_pack packer(table.view(), fixed_buffer_size, stream, table_mr); + // upper bound multiple of bounce buffer size + n_buffers = (packer.get_total_contiguous_size() + fixed_buffer_size - 1) + / fixed_buffer_size; + } + + // Allocate fixed sized host buffers for the destination + std::vector fixed_host_buffers; + for (size_t i = 0; i < n_buffers; i++) { + fixed_host_buffers.emplace_back(fixed_buffer_size, stream, host_mr); + } + + rmm::device_buffer bounce_buffer(fixed_buffer_size, stream, pack_mr); + + auto run_packer = [&] { + cudf::chunked_pack packer(table.view(), fixed_buffer_size, stream, pack_mr); + + std::size_t buffer_idx = 0; + while (packer.has_next()) { + auto const bytes_copied = packer.next( + cudf::device_span( + static_cast(bounce_buffer.data()), fixed_buffer_size + ) + ); + RAPIDSMPF_CUDA_TRY(cudaMemcpyAsync( + static_cast(fixed_host_buffers[buffer_idx].data()), + bounce_buffer.data(), + bytes_copied, + cudaMemcpyDefault, + stream.value() + )); + buffer_idx++; + } }; + + { // Warm up + run_packer(); + stream.synchronize(); + } + + for (auto _ : state) { + run_packer(); + benchmark::DoNotOptimize(bounce_buffer); + benchmark::DoNotOptimize(fixed_host_buffers); + stream.synchronize(); + } + + state.SetBytesProcessed( + static_cast(state.iterations()) + * static_cast(table_size) + ); + state.counters["table_size_mb"] = + static_cast(table_size) / static_cast(MB); + state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = + static_cast(bounce_buffer.size()) / static_cast(MB); +} + +static void BM_ChunkedPack_device_to_fixed_sized_pinned(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory 
resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; rapidsmpf::PinnedMemoryResource pinned_mr; - run_chunked_pack_without_bounce_buffer( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, pinned_mr, stream + ); +} + +static void BM_ChunkedPack_device_to_fixed_sized_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; + rapidsmpf::HostMemoryResource host_mr; + + run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, host_mr, stream + ); +} + +static void BM_ChunkedPack_pinned_to_fixed_sized_host(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const table_size_mb = static_cast(state.range(0)); + auto const table_size_bytes = table_size_mb * MB; + + // Bounce buffer size: max(1MB, table_size / 10) + auto const bounce_buffer_size = std::max(MB, table_size_bytes / 10); + + rmm::cuda_stream_view stream = 
rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; + rapidsmpf::PinnedMemoryResource pinned_mr; + rapidsmpf::HostMemoryResource host_mr; + + run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, host_mr, stream ); } @@ -686,6 +894,28 @@ BENCHMARK(BM_ChunkedPack_pinned_copy_to_host) ->UseRealTime() ->Unit(benchmark::kMillisecond); +// Chunked pack with fixed sized host buffers and bounce buffer benchmarks + +BENCHMARK(BM_ChunkedPack_device_to_fixed_sized_pinned) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_device_to_fixed_sized_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned_to_fixed_sized_pinned) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_pinned_to_fixed_sized_host) + ->Apply(PackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + /** * @brief Benchmark for cudf::chunked_pack in device memory varying the bounce buffer size * and keeping table size fixed at 1GB @@ -695,16 +925,10 @@ static void BM_ChunkedPack_fixed_table_device(benchmark::State& state) { constexpr std::size_t table_size_bytes = 1024 * MB; rmm::cuda_stream_view stream = rmm::cuda_stream_default; - - // Create memory resources rmm::mr::cuda_async_memory_resource cuda_mr; - rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; - run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pool_mr, pool_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, cuda_mr, stream ); } @@ -722,15 +946,11 @@ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { constexpr std::size_t table_size_bytes = 1024 * MB; rmm::cuda_stream_view stream = rmm::cuda_stream_default; - rmm::mr::cuda_async_memory_resource cuda_mr; - 
rmm::mr::pool_memory_resource pool_mr{ - cuda_mr, rmm::percent_of_free_device_memory(40) - }; rapidsmpf::PinnedMemoryResource pinned_mr; run_chunked_pack( - state, bounce_buffer_size, table_size_bytes, pool_mr, pinned_mr, pinned_mr, stream + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream ); } From 7126e01847851e51acbd46b9494e9e694579ae55 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 20 Jan 2026 12:47:36 -0800 Subject: [PATCH 09/10] extending benchmarks Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 93 ++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index 2a356ba32..2b7c4a9e3 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -66,6 +66,7 @@ void run_pack( ); state.counters["table_size_mb"] = static_cast(table_size_mb); state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = 0; } /** @@ -82,7 +83,7 @@ static void BM_Pack_device(benchmark::State& state) { } /** - * @brief Benchmark for cudf::pack with device memory + * @brief Benchmark for cudf::pack with pinned host memory as the pack destination. */ static void BM_Pack_pinned_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { @@ -156,10 +157,11 @@ void run_pack_and_copy( ); state.counters["table_size_mb"] = static_cast(table_size_mb); state.counters["num_rows"] = nrows; + state.counters["bounce_buffer_mb"] = 0; } /** - * @brief Benchmark for cudf::pack with device memory + * @brief Benchmark for cudf::pack with device memory and copy to host memory. 
*/ static void BM_Pack_device_copy_to_host(benchmark::State& state) { auto const table_size_mb = static_cast(state.range(0)); @@ -173,7 +175,7 @@ static void BM_Pack_device_copy_to_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::pack with device memory + * @brief Benchmark for cudf::pack with device memory and copy to pinned host memory. */ static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { @@ -191,7 +193,7 @@ static void BM_Pack_device_copy_to_pinned_host(benchmark::State& state) { } /** - * @brief Benchmark for cudf::pack with pinned memory + * @brief Benchmark for cudf::pack with pinned memory and copy to host memory. */ static void BM_Pack_pinned_copy_to_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { @@ -362,7 +364,7 @@ static void BM_ChunkedPack_device_copy_to_host(benchmark::State& state) { /** * @brief Benchmark for cudf::chunked_pack with pinned bounce buffer and pinned host - * destination buffer. memory. + * destination buffer. * @param state The benchmark state containing the table size in MB as the first range * argument. */ @@ -492,8 +494,7 @@ void run_chunked_pack_without_bounce_buffer( } /** - * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination - * buffer. + * @brief Benchmark for cudf::chunked_pack directly into device memory (no bounce buffer). * @param state The benchmark state containing the table size in MB as the first range * argument. */ @@ -513,8 +514,8 @@ static void BM_ChunkedPack_device(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination - * buffer. + * @brief Benchmark for cudf::chunked_pack directly into pinned host memory with device + * pack memory resource (no bounce buffer). * @param state The benchmark state containing the table size in MB as the first range * argument. 
*/ @@ -540,8 +541,8 @@ static void BM_ChunkedPack_pinned_device_mr(benchmark::State& state) { } /** - * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination - * buffer. + * @brief Benchmark for cudf::chunked_pack directly into pinned host memory with pinned + * pack memory resource (no bounce buffer). * @param state The benchmark state containing the table size in MB as the first range * argument. */ @@ -645,8 +646,8 @@ void run_chunked_pack_with_fixed_sized_host_buffers( } /** - * @brief Benchmark for cudf::chunked_pack with device bounce buffer and destination - * buffer. + * @brief Benchmark for cudf::chunked_pack directly into fixed-sized pinned host buffers + * with pinned pack memory resource (no bounce buffer). * @param state The benchmark state containing the table size in MB as the first range * argument. */ @@ -745,6 +746,12 @@ void run_chunked_pack_with_fixed_sized_host_buffers_and_bounce_buffer( static_cast(bounce_buffer.size()) / static_cast(MB); } +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer copying to + * fixed-sized pinned host buffers. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ static void BM_ChunkedPack_device_to_fixed_sized_pinned(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); @@ -766,6 +773,12 @@ static void BM_ChunkedPack_device_to_fixed_sized_pinned(benchmark::State& state) ); } +/** + * @brief Benchmark for cudf::chunked_pack with device bounce buffer copying to + * fixed-sized host buffers. + * @param state The benchmark state containing the table size in MB as the first range + * argument. 
+ */ static void BM_ChunkedPack_device_to_fixed_sized_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); @@ -787,6 +800,12 @@ static void BM_ChunkedPack_device_to_fixed_sized_host(benchmark::State& state) { ); } +/** + * @brief Benchmark for cudf::chunked_pack with pinned bounce buffer copying to + * fixed-sized host buffers. + * @param state The benchmark state containing the table size in MB as the first range + * argument. + */ static void BM_ChunkedPack_pinned_to_fixed_sized_host(benchmark::State& state) { if (!rapidsmpf::is_pinned_memory_resources_supported()) { state.SkipWithMessage("Pinned memory resources are not supported"); @@ -954,6 +973,44 @@ static void BM_ChunkedPack_fixed_table_pinned(benchmark::State& state) { ); } +/** + * @brief Benchmark for cudf::chunked_pack in device memory varying the bounce buffer size + * and keeping table size fixed at 1GB + */ +static void BM_ChunkedPack_fixed_table_device_no_bounce_buffer(benchmark::State& state) { + auto const bounce_buffer_size = static_cast(state.range(0)) * MB; + constexpr std::size_t table_size_bytes = 1024 * MB; + + rmm::cuda_stream_view stream = rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, cuda_mr, cuda_mr, stream + ); +} + +/** + * @brief Benchmark for cudf::chunked_pack in pinned memory varying the bounce buffer size + * and keeping table size fixed at 1GB + */ +static void BM_ChunkedPack_fixed_table_pinned_no_bounce_buffer(benchmark::State& state) { + if (!rapidsmpf::is_pinned_memory_resources_supported()) { + state.SkipWithMessage("Pinned memory resources are not supported"); + return; + } + + auto const bounce_buffer_size = static_cast(state.range(0)) * MB; + constexpr std::size_t table_size_bytes = 1024 * MB; + + rmm::cuda_stream_view stream = 
rmm::cuda_stream_default; + rmm::mr::cuda_async_memory_resource cuda_mr; + rapidsmpf::PinnedMemoryResource pinned_mr; + + run_chunked_pack_without_bounce_buffer( + state, bounce_buffer_size, table_size_bytes, cuda_mr, pinned_mr, pinned_mr, stream + ); +} + /** * @brief Custom argument generator for chunked pack benchmarks with fixed table size. * @@ -978,4 +1035,14 @@ BENCHMARK(BM_ChunkedPack_fixed_table_pinned) ->UseRealTime() ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_ChunkedPack_fixed_table_device_no_bounce_buffer) + ->Apply(ChunkedPackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_ChunkedPack_fixed_table_pinned_no_bounce_buffer) + ->Apply(ChunkedPackArguments) + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + BENCHMARK_MAIN(); From 2fc4c36dd0bd84d7b9ee48297aed0a56b84a3350 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 20 Jan 2026 12:54:29 -0800 Subject: [PATCH 10/10] precommit Signed-off-by: niranda perera --- cpp/benchmarks/bench_pack.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/bench_pack.cpp b/cpp/benchmarks/bench_pack.cpp index a3817c281..3bd66ecc8 100644 --- a/cpp/benchmarks/bench_pack.cpp +++ b/cpp/benchmarks/bench_pack.cpp @@ -1,5 +1,5 @@ /** - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */