From 73df919129b7892b6a8cef11afa0197a7cbc693e Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Thu, 31 Jul 2025 10:47:42 +0100 Subject: [PATCH 1/2] Re-add buggy shared arg allocation for benchmarking. --- sycl/source/detail/queue_impl.cpp | 7 +++++++ sycl/source/detail/queue_impl.hpp | 10 ++++++++++ sycl/source/detail/scheduler/commands.cpp | 3 +-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 24ed44b219e3a..27b62c0118ee8 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -859,6 +859,13 @@ void queue_impl::verifyProps(const property_list &Props) const { CheckPropertiesWithData); } +std::vector & +queue_impl::getKernelArgStorage(uint32_t size) { + MKernelArgStorage.clear(); + MKernelArgStorage.reserve(size); + return MKernelArgStorage; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 7e15c772f2697..f0c7b7c1488f7 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -193,6 +193,7 @@ class queue_impl : public std::enable_shared_from_this { const async_handler &AsyncHandler, const property_list &PropList, private_tag) : MDevice([&]() -> device_impl & { + MKernelArgStorage.reserve(10); ur_device_handle_t DeviceUr{}; adapter_impl &Adapter = Context.getAdapter(); // TODO catch an exception and put it to list of asynchronous @@ -683,6 +684,11 @@ class queue_impl : public std::enable_shared_from_this { } #endif + /// Clears MKernelArgStorage, calls .reserve(size) on it, and returns a reference + /// to it; bind the result by reference. Not inherently thread safe. 
+ std::vector & + getKernelArgStorage(uint32_t size); + protected: template EventImplPtr insertHelperBarrier(const HandlerType &Handler) { @@ -999,6 +1005,10 @@ class queue_impl : public std::enable_shared_from_this { ur_queue_handle_t MQueue; + // To avoid re-allocating this every time a kernel is enqueued we keep this + // vector around and .clear()/.reserve() for each kernel instead. + std::vector MKernelArgStorage; + // Access should be guarded with MMutex struct DependencyTrackingItems { // This event is employed for enhanced dependency tracking with in-order diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 70f12d0a59ef7..94c704db22f1e 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2424,8 +2424,7 @@ static ur_result_t SetKernelParamsAndLaunch( DeviceImageImpl ? DeviceImageImpl->get_spec_const_blob_ref() : Empty); } - std::vector UrArgs; - UrArgs.reserve(Args.size()); + auto &UrArgs = Queue.getKernelArgStorage(Args.size()); if (KernelFuncPtr && !KernelHasSpecialCaptures) { auto setFunc = [&UrArgs, From 04bb00a582bf33fe096658686ebc2fe08b68d559 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Thu, 31 Jul 2025 14:47:31 +0100 Subject: [PATCH 2/2] Try without over-generous initial reservation. --- sycl/source/detail/queue_impl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index f0c7b7c1488f7..f61bb23df563d 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -193,7 +193,6 @@ class queue_impl : public std::enable_shared_from_this { const async_handler &AsyncHandler, const property_list &PropList, private_tag) : MDevice([&]() -> device_impl & { - MKernelArgStorage.reserve(10); ur_device_handle_t DeviceUr{}; adapter_impl &Adapter = Context.getAdapter(); // TODO catch an exception and put it to list of asynchronous