diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 1a2cdcb4c21e1..0ab21770dc8fa 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -946,6 +946,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
     if (ThreadLimitClause[0] > 0 && ThreadLimitClause[0] != (uint32_t)-1 &&
         ThreadLimitClause[0] <= static_cast<uint32_t>(ConstWGSize))
       return llvm::omp::getBlockSizeAsPowerOfTwo(ThreadLimitClause[0]);
+    uint32_t BlockSizeOverride = GenericDevice.getOMPXXteamBlockSize();
+    if (BlockSizeOverride > 0 &&
+        BlockSizeOverride <= static_cast<uint32_t>(ConstWGSize))
+      return llvm::omp::getBlockSizeAsPowerOfTwo(BlockSizeOverride);
     assert(((ConstWGSize & (ConstWGSize - 1)) == 0) &&
            "XTeam Reduction blocksize must be a power of two");
     return ConstWGSize;
@@ -3096,6 +3100,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
             "LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS", 1),
         OMPX_GenericSpmdUseSmallBlockSize(
             "LIBOMPTARGET_AMDGPU_GENERIC_SPMD_USE_SMALL_BLOCKSIZE", 1),
+        OMPX_XteamBlockSize("LIBOMPTARGET_AMDGPU_XTEAM_BLOCKSIZE", 0),
         OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
                                64 * 1024),
         OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
@@ -3235,6 +3240,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   virtual bool getOMPXGenericSpmdUseSmallBlockSize() const override {
     return OMPX_GenericSpmdUseSmallBlockSize;
   }
+  virtual uint32_t getOMPXXteamBlockSize() const override {
+    return OMPX_XteamBlockSize;
+  }
 
   uint64_t getDeviceTimeStamp() override { return getSystemTimestampInNs(); }
 
@@ -4872,6 +4880,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// be reduced and the corresponding number of teams adjusted.
   BoolEnvar OMPX_GenericSpmdUseSmallBlockSize;
 
+  /// Envar indicating the blocksize to be used for Xteam reduction kernels. The
+  /// default of 0 indicates that there is no runtime override and the value
+  /// indicated by CodeGen will be used. If a non-zero value is specified, the
+  /// runtime will attempt to use it as an override if other constraints are
+  /// satisfied.
+  UInt32Envar OMPX_XteamBlockSize;
+
   /// Envar specifying the maximum size in bytes where the memory copies are
   /// asynchronous operations. Up to this transfer size, the memory copies are
   /// asynchronous operations pushed to the corresponding stream. For larger
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d0f484210b785..2fb9410419aa5 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1164,6 +1164,9 @@
   virtual bool getOMPXGenericSpmdUseSmallBlockSize() const {
     llvm_unreachable("Unimplemented");
   }
+  virtual uint32_t getOMPXXteamBlockSize() const {
+    llvm_unreachable("Unimplemented");
+  }
 
   /// Get target compute unit kind (e.g., sm_80, or gfx908).
   virtual std::string getComputeUnitKind() const { return "unknown"; }