diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..e0110d33cf5f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -59,6 +59,7 @@ FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPULateCodeGenPrepareLegacyPass();
 FunctionPass *createAMDGPUReserveWWMRegsPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createAMDGPUTDMOptimizationPass();
 ModulePass *
 createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
 ModulePass *createAMDGPULowerBufferFatPointersPass();
@@ -170,6 +171,8 @@ extern char &AMDGPUPrepareAGPRAllocLegacyID;
 void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &);
 extern char &AMDGPUReserveWWMRegsLegacyID;
 
+void initializeAMDGPUTDMOptimizationPass(PassRegistry &);
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTDMOptimization.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTDMOptimization.cpp
new file mode 100644
index 0000000000000..28ddece0c11e5
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTDMOptimization.cpp
@@ -0,0 +1,515 @@
+//===-- AMDGPUTDMOptimization.cpp - TDM Descriptor Optimization ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes Tensor Data Movement (TDM) descriptor creation patterns.
+// It identifies insertelement chains that build descriptors and rewrites them
+// as an alloca plus per-field stores. SROA later promotes the alloca back to
+// SSA form in a shape that lets instruction selection emit INSERT_SUBREG
+// field updates instead of rebuilding the whole descriptor.
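+//
+// For example (illustrative sketch of the rewrite; names match those the pass
+// assigns), a chain such as
+//
+//   %d1 = insertelement <4 x i32> <template>, i32 %a, i64 1
+//   %d2 = insertelement <4 x i32> %d1, i32 %b, i64 2
+//
+// becomes
+//
+//   %tdm_desc_storage = alloca <4 x i32>, align 16, addrspace(5)
+//   store <4 x i32> <template>, ptr addrspace(5) %tdm_desc_storage
+//   %tdm_field_1_ptr = getelementptr <4 x i32>, ptr addrspace(5) %tdm_desc_storage, i32 0, i32 1
+//   store i32 %a, ptr addrspace(5) %tdm_field_1_ptr
+//   store i32 %b, ptr addrspace(5) %tdm_field_2_ptr
+//   %tdm_optimized_desc = load <4 x i32>, ptr addrspace(5) %tdm_desc_storage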
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-tdm-optimization"
+
+static cl::opt<unsigned>
+    TDMOptBenefitThreshold("amdgpu-tdm-opt-threshold", cl::Hidden, cl::init(10),
+                           cl::desc("Minimum optimization benefit threshold "
+                                    "for TDM descriptor optimization"));
+
+namespace llvm {
+void initializeAMDGPUTDMOptimizationPass(PassRegistry &);
+}
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Pattern Detection Data Structures
+//===----------------------------------------------------------------------===//
+
+/// Represents a single descriptor creation pattern
+struct DescriptorPattern {
+  Type *DescType;                         ///< <4 x i32> or <8 x i32>
+  Value *BaseValue;                       ///< Base template (constant or computed)
+  SmallVector<InsertElementInst *> Chain; ///< Chain of insertelement instructions
+  SmallVector<unsigned> VariableFields;   ///< Fields that change
+  SmallVector<unsigned> ConstantFields;   ///< Fields that stay constant
+  BasicBlock *Location;                   ///< Where the pattern is located
+  Loop *ContainingLoop;                   ///< Loop containing this pattern (if any)
+
+  /// Calculate field reuse ratio (constant fields / total fields)
+  float getFieldReuseRatio() const {
+    unsigned totalFields = cast<FixedVectorType>(DescType)->getNumElements();
+    return (float)ConstantFields.size() / totalFields;
+  }
+
+  /// Check if this pattern is worth optimizing
+  bool isWorthOptimizing() const {
+    // Always optimize if in loop with reuse potential
+    if (ContainingLoop && getFieldReuseRatio() >= 0.5f)
+      return true;
+
+    // Optimize if significant field reuse
+    if (getFieldReuseRatio() >= 0.75f)
+      return true;
+
+    // Optimize address descriptors (common case)
+    if (isAddressDescriptor() && ConstantFields.size() >= 1)
+      return true;
+
+    return false;
+  }
+
+  /// Check if this is an address descriptor (<4 x i32>)
+  bool isAddressDescriptor() const {
+    auto *VecTy = cast<FixedVectorType>(DescType);
+    return VecTy->getNumElements() == 4 &&
+           VecTy->getElementType()->isIntegerTy(32);
+  }
+
+  /// Check if this is a tensor descriptor (<8 x i32>)
+  bool isTensorDescriptor() const {
+    auto *VecTy = cast<FixedVectorType>(DescType);
+    return VecTy->getNumElements() == 8 &&
+           VecTy->getElementType()->isIntegerTy(32);
+  }
+};
+
+/// Groups similar descriptor patterns for optimization
+struct DescriptorGroup {
+  SmallVector<DescriptorPattern> Patterns;
+  Type *SharedType;
+  Value *SharedBase; ///< Common base value (if any)
+  SmallVector<unsigned> SharedConstantFields;
+
+  /// Calculate total optimization benefit
+  unsigned getOptimizationBenefit() const {
+    unsigned benefit = 0;
+    for (const auto &pattern : Patterns) {
+      // Base benefit from field reuse
+      benefit += pattern.ConstantFields.size() * 2;
+
+      // Extra benefit for loop patterns
+      if (pattern.ContainingLoop)
+        benefit *= 5;
+    }
+    return benefit;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// AMDGPUTDMOptimization Pass
+//===----------------------------------------------------------------------===//
+
+class AMDGPUTDMOptimization : public FunctionPass {
+private:
+  LoopInfo *LI = nullptr;
+
+  /// Detected patterns in the function
+  SmallVector<DescriptorPattern> DetectedPatterns;
+
+  /// Groups of optimizable patterns
+  SmallVector<DescriptorGroup> OptimizationGroups;
+
+public:
+  static char ID;
+
+  AMDGPUTDMOptimization() : FunctionPass(ID) {
+    initializeAMDGPUTDMOptimizationPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  /// Main optimization phases
+  bool detectDescriptorPatterns(Function &F);
+  void groupSimilarPatterns();
+  bool transformPatterns(Function &F);
+
+  /// Pattern detection helpers
+  bool isDescriptorType(Type *Ty) const;
+  DescriptorPattern analyzeInsertChain(InsertElementInst *FinalInsert);
+  Value *extractBaseValue(const DescriptorPattern &Pattern);
+
+  /// Transformation helpers
+  bool transformDescriptorGroup(DescriptorGroup &Group, Function &F);
+  Value *createSharedStorage(DescriptorGroup &Group, IRBuilder<> &Builder);
+  void transformSinglePattern(DescriptorPattern &Pattern, Value *SharedStorage,
+                              IRBuilder<> &Builder);
+
+  /// Utility functions
+  Loop *getContainingLoop(BasicBlock *BB);
+  bool arePatternsSimilar(const DescriptorPattern &A,
+                          const DescriptorPattern &B);
+};
+
+//===----------------------------------------------------------------------===//
+// Pass Implementation
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUTDMOptimization::runOnFunction(Function &F) {
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+  LLVM_DEBUG(dbgs() << "Running TDM optimization on function: " << F.getName()
+                    << "\n");
+
+  // Phase 1: Detect descriptor patterns
+  if (!detectDescriptorPatterns(F)) {
+    LLVM_DEBUG(dbgs() << "No descriptor patterns found\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Found " << DetectedPatterns.size()
+                    << " descriptor patterns\n");
+
+  // Phase 2: Group similar patterns for optimization
+  groupSimilarPatterns();
+
+  LLVM_DEBUG(dbgs() << "Created " << OptimizationGroups.size()
+                    << " optimization groups\n");
+
+  // Phase 3: Transform patterns
+  bool Changed = transformPatterns(F);
+
+  // Cleanup for next function
+  DetectedPatterns.clear();
+  OptimizationGroups.clear();
+
+  return Changed;
+}
+
+void AMDGPUTDMOptimization::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<LoopInfoWrapperPass>();
+  AU.setPreservesCFG();
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern Detection
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUTDMOptimization::detectDescriptorPatterns(Function &F) {
+  bool FoundPatterns = false;
+
+  // Scan function for insertelement instructions that create descriptors
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      auto *IE = dyn_cast<InsertElementInst>(&I);
+      if (!IE || !isDescriptorType(IE->getType()))
+        continue;
+
+      // Check if this is the final insert in a descriptor creation chain
+      if (!IE->hasOneUse() || isa<InsertElementInst>(*IE->user_begin()))
+        continue;
+
+      // Analyze the complete chain
+      DescriptorPattern Pattern = analyzeInsertChain(IE);
+      if (Pattern.Chain.empty())
+        continue;
+
+      // Check if worth optimizing
+      if (!Pattern.isWorthOptimizing()) {
+        LLVM_DEBUG(
+            dbgs() << "Pattern not worth optimizing: field reuse ratio = "
+                   << Pattern.getFieldReuseRatio() << "\n");
+        continue;
+      }
+
+      LLVM_DEBUG(
+          dbgs() << "Found optimizable pattern: "
"Address" : "Tensor") + << " descriptor with " << Pattern.ConstantFields.size() + << " constant fields\n"); + + DetectedPatterns.push_back(std::move(Pattern)); + FoundPatterns = true; + } + } + + return FoundPatterns; +} + +bool AMDGPUTDMOptimization::isDescriptorType(Type *Ty) const { + auto *VecTy = dyn_cast(Ty); + if (!VecTy || !VecTy->getElementType()->isIntegerTy(32)) + return false; + + unsigned NumElements = VecTy->getNumElements(); + return NumElements == 4 || NumElements == 8; // Address or tensor descriptors +} + +DescriptorPattern +AMDGPUTDMOptimization::analyzeInsertChain(InsertElementInst *FinalInsert) { + DescriptorPattern Pattern; + Pattern.DescType = FinalInsert->getType(); + Pattern.Location = FinalInsert->getParent(); + Pattern.ContainingLoop = getContainingLoop(Pattern.Location); + + // Trace back the insertelement chain + SmallVector Chain; + Value *CurrentVal = FinalInsert; + + while (auto *IE = dyn_cast(CurrentVal)) { + Chain.push_back(IE); + CurrentVal = IE->getOperand(0); // Vector being inserted into + } + + // Reverse to get forward order + std::reverse(Chain.begin(), Chain.end()); + Pattern.Chain = Chain; + + // Extract base value (the initial vector) + Pattern.BaseValue = extractBaseValue(Pattern); + + // Analyze which fields are constant vs variable + unsigned NumElements = + cast(Pattern.DescType)->getNumElements(); + SmallBitVector FieldSet(NumElements, false); + + for (auto *IE : Chain) { + if (auto *CI = dyn_cast(IE->getOperand(2))) { + unsigned Idx = CI->getZExtValue(); + if (Idx < NumElements) { + FieldSet.set(Idx); + Pattern.VariableFields.push_back(Idx); + } + } + } + + // Fields not in chain are constant + for (unsigned i = 0; i < NumElements; ++i) { + if (!FieldSet[i]) + Pattern.ConstantFields.push_back(i); + } + + return Pattern; +} + +Value * +AMDGPUTDMOptimization::extractBaseValue(const DescriptorPattern &Pattern) { + if (Pattern.Chain.empty()) + return nullptr; + + // Get the vector being inserted into by the first insert + Value *Base = Pattern.Chain[0]->getOperand(0); + + // If base is a constant vector or another recognizable pattern, return it + if (isa(Base)) + return Base; + + // For shufflevector results, we might want to trace further back + if (auto *SV = dyn_cast(Base)) + return SV; // Keep shufflevector as base for now + + return Base; +} + +Loop *AMDGPUTDMOptimization::getContainingLoop(BasicBlock *BB) { + return LI ? 
+  return LI ? LI->getLoopFor(BB) : nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern Grouping
+//===----------------------------------------------------------------------===//
+
+void AMDGPUTDMOptimization::groupSimilarPatterns() {
+  // Simple grouping strategy: group by type and base similarity
+  for (auto &Pattern : DetectedPatterns) {
+    bool Added = false;
+
+    // Try to add to existing group
+    for (auto &Group : OptimizationGroups) {
+      if (Group.SharedType == Pattern.DescType &&
+          arePatternsSimilar(Group.Patterns[0], Pattern)) {
+        Group.Patterns.push_back(Pattern);
+        Added = true;
+        break;
+      }
+    }
+
+    // Create new group if needed
+    if (!Added) {
+      DescriptorGroup NewGroup;
+      NewGroup.SharedType = Pattern.DescType;
+      NewGroup.SharedBase = Pattern.BaseValue;
+      NewGroup.Patterns.push_back(Pattern);
+      OptimizationGroups.push_back(std::move(NewGroup));
+    }
+  }
+
+  // Remove groups that don't meet optimization criteria
+  OptimizationGroups.erase(
+      std::remove_if(OptimizationGroups.begin(), OptimizationGroups.end(),
+                     [](const DescriptorGroup &Group) {
+                       return Group.getOptimizationBenefit() <
+                              TDMOptBenefitThreshold;
+                     }),
+      OptimizationGroups.end());
+}
+
+bool AMDGPUTDMOptimization::arePatternsSimilar(const DescriptorPattern &A,
+                                               const DescriptorPattern &B) {
+  // Patterns are similar if they have same type and similar field usage
+  if (A.DescType != B.DescType)
+    return false;
+
+  // Check if constant fields overlap significantly
+  SmallBitVector AConstants(
+      cast<FixedVectorType>(A.DescType)->getNumElements());
+  SmallBitVector BConstants(
+      cast<FixedVectorType>(B.DescType)->getNumElements());
+
+  for (unsigned Field : A.ConstantFields)
+    AConstants.set(Field);
+  for (unsigned Field : B.ConstantFields)
+    BConstants.set(Field);
+
+  // Count overlapping constant fields
+  auto Intersection = AConstants & BConstants;
+  unsigned OverlapCount = Intersection.count();
+  unsigned TotalConstants = std::max(AConstants.count(), BConstants.count());
+
+  return TotalConstants > 0 && (float)OverlapCount / TotalConstants >= 0.5f;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern Transformation
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUTDMOptimization::transformPatterns(Function &F) {
+  bool Changed = false;
+
+  for (auto &Group : OptimizationGroups) {
+    LLVM_DEBUG(dbgs() << "Transforming group with " << Group.Patterns.size()
+                      << " patterns, benefit = "
+                      << Group.getOptimizationBenefit() << "\n");
+
+    if (transformDescriptorGroup(Group, F))
+      Changed = true;
+  }
+
+  return Changed;
+}
+
+bool AMDGPUTDMOptimization::transformDescriptorGroup(DescriptorGroup &Group,
+                                                     Function &F) {
+  if (Group.Patterns.empty())
+    return false;
+
+  // Find the best location to place shared storage
+  BasicBlock *StorageLocation = Group.Patterns[0].Location;
+
+  // If patterns are in a loop, try to hoist storage outside loop
+  if (auto *Loop = Group.Patterns[0].ContainingLoop) {
+    if (auto *Preheader = Loop->getLoopPreheader()) {
+      StorageLocation = Preheader;
+      LLVM_DEBUG(dbgs() << "Hoisting storage outside loop\n");
+    }
+  }
+
+  // Create shared storage at the beginning of the storage block
+  IRBuilder<> Builder(&StorageLocation->front());
+  Value *SharedStorage = createSharedStorage(Group, Builder);
+
+  if (!SharedStorage)
+    return false;
+
+  // Transform each pattern in the group
+  for (auto &Pattern : Group.Patterns) {
+    IRBuilder<> PatternBuilder(Pattern.Chain.back());
+    transformSinglePattern(Pattern, SharedStorage, PatternBuilder);
+  }
+
+  return true;
+}
+
+Value *AMDGPUTDMOptimization::createSharedStorage(DescriptorGroup &Group,
+                                                  IRBuilder<> &Builder) {
+  // Create alloca in address space 5 (AMDGPU private memory)
+  auto *StorageType = Group.SharedType;
+  auto *Storage = Builder.CreateAlloca(
+      StorageType, /*AddrSpace=*/5, /*ArraySize=*/nullptr, "tdm_desc_storage");
+
+  // Initialize with base template if available
+  if (Group.SharedBase) {
+    auto *BaseConstant = dyn_cast<Constant>(Group.SharedBase);
+    if (BaseConstant) {
+      Builder.CreateStore(BaseConstant, Storage);
+      LLVM_DEBUG(dbgs() << "Initialized storage with constant base\n");
+    }
+  }
+
+  return Storage;
+}
+
+void AMDGPUTDMOptimization::transformSinglePattern(DescriptorPattern &Pattern,
+                                                   Value *SharedStorage,
+                                                   IRBuilder<> &Builder) {
+  // Create field pointers for variable fields
+  SmallVector<Value *> FieldPointers;
+  for (unsigned FieldIdx : Pattern.VariableFields) {
+    Value *FieldPtr =
+        Builder.CreateGEP(Pattern.DescType, SharedStorage,
+                          {Builder.getInt32(0), Builder.getInt32(FieldIdx)},
+                          "tdm_field_" + Twine(FieldIdx) + "_ptr");
+    FieldPointers.push_back(FieldPtr);
+  }
+
+  // Update variable fields with values from the original chain
+  for (unsigned i = 0;
+       i < Pattern.VariableFields.size() && i < Pattern.Chain.size(); ++i) {
+    auto *InsertInst = Pattern.Chain[i];
+    Value *NewValue = InsertInst->getOperand(1); // Value being inserted
+    Builder.CreateStore(NewValue, FieldPointers[i]);
+  }
+
+  // Replace final result with load from shared storage
+  Value *OptimizedDescriptor =
+      Builder.CreateLoad(Pattern.DescType, SharedStorage, "tdm_optimized_desc");
+
+  // Replace all uses of the final insert with the load
+  Pattern.Chain.back()->replaceAllUsesWith(OptimizedDescriptor);
+
+  // Let DCE (Dead Code Elimination) clean up the now-unused insertelement
+  // chains. The instructions should be dead after the replaceAllUsesWith above.
+
+  LLVM_DEBUG(dbgs() << "Transformed pattern with "
+                    << Pattern.VariableFields.size() << " variable fields\n");
+}
+
+} // end anonymous namespace
+
+char AMDGPUTDMOptimization::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUTDMOptimization, DEBUG_TYPE,
+                      "AMDGPU TDM Descriptor Optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUTDMOptimization, DEBUG_TYPE,
+                    "AMDGPU TDM Descriptor Optimization", false, false)
+
+namespace llvm {
+FunctionPass *createAMDGPUTDMOptimizationPass() {
+  return new AMDGPUTDMOptimization();
+}
+} // namespace llvm
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 309e92a2ee88e..fe8dcc6df3367 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -129,6 +129,10 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
+static cl::opt<bool> EnableTDMOptimization(
+    "amdgpu-enable-tdm-opt", cl::Hidden, cl::init(true),
+    cl::desc("Enable AMDGPU TDM descriptor optimization"));
+
 namespace {
 //===----------------------------------------------------------------------===//
 // AMDGPU CodeGen Pass Builder interface.
@@ -593,6 +597,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPULowerBufferFatPointersPass(*PR); initializeAMDGPULowerIntrinsicsLegacyPass(*PR); + initializeAMDGPUTDMOptimizationPass(*PR); initializeAMDGPUReserveWWMRegsLegacyPass(*PR); initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); @@ -1413,6 +1418,12 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + // Add TDM Descriptor Optimization + SROA sequence + if (EnableTDMOptimization) { + addPass(createAMDGPUTDMOptimizationPass()); // Create alloca patterns + addPass(createSROAPass()); // Convert to INSERT_SUBREG + } + TargetPassConfig::addCodeGenPrepare(); if (isPassEnabled(EnableLoadStoreVectorizer)) diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 782cbfa76e6e9..4253257974fb8 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -117,6 +117,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp AMDGPUTargetTransformInfo.cpp + AMDGPUTDMOptimization.cpp AMDGPUWaitSGPRHazards.cpp AMDGPUUnifyDivergentExitNodes.cpp R600MachineCFGStructurizer.cpp diff --git a/llvm/test/CodeGen/AMDGPU/tdm-optimization.ll b/llvm/test/CodeGen/AMDGPU/tdm-optimization.ll new file mode 100644 index 0000000000000..491ac1fa9a6f9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tdm-optimization.ll @@ -0,0 +1,485 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + +; Check the complete optimization pipeline: TDM pass -> SROA -> MIR backend +; Stage 1: TDM optimization creates alloca patterns +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-tdm-opt-threshold=2 -stop-after=amdgpu-tdm-optimization < %s | FileCheck %s --check-prefix=TDM-PASS + +; Stage 2: SROA converts allocas to SSA form (phi nodes, extract/insertelement) +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-tdm-opt-threshold=2 -stop-after=sroa < %s | FileCheck %s --check-prefix=SROA-PASS + +; Stage 3: MIR backend generates INSERT_SUBREG instead of REG_SEQUENCE +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-tdm-opt-threshold=2 -stop-after=amdgpu-isel < %s | FileCheck %s --check-prefix=MIR-PASS + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; Declare the AMDGPU intrinsics with proper signatures from tdm.ll +declare void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32>, <8 x i32>, i32 immarg) #5 + +attributes #5 = { convergent nocallback nofree nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } + +;===----------------------------------------------------------------------===; +; Pattern 1: Basic Address Descriptor Chain Optimization +;===----------------------------------------------------------------------===; + +; Test 1a: Basic address descriptor with base reuse (generates INSERT_SUBREG) +define amdgpu_kernel void @test_basic_address_descriptor(i32 %val1, i32 %val2, i32 %val3, i32 %val4, i32 %val5, i32 %val6) { +; Stage 1: TDM optimization creates alloca + field update pattern (old insertelement chains remain until DCE) +; TDM-PASS-LABEL: 
@test_basic_address_descriptor( +; TDM-PASS-DAG: %[[TDM_DESC_STORAGE:.*]] = alloca <4 x i32>, align 16, addrspace(5) +; TDM-PASS-DAG: store <4 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]] +; TDM-PASS-DAG: %[[TDM_FIELD_1_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 1 +; TDM-PASS-DAG: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; TDM-PASS-DAG: %[[TDM_FIELD_3_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 3 +; TDM-PASS-DAG: store i32 {{.*}}, ptr addrspace(5) %[[TDM_FIELD_1_PTR]] +; TDM-PASS-DAG: store i32 {{.*}}, ptr addrspace(5) %[[TDM_FIELD_2_PTR]] +; TDM-PASS-DAG: store i32 {{.*}}, ptr addrspace(5) %[[TDM_FIELD_3_PTR]] +; TDM-PASS-DAG: %[[TDM_DESC_LOAD:.*]] = load <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]] + +; Stage 2: SROA converts alloca to optimized SSA form (base reuse pattern) +; SROA-PASS-LABEL: @test_basic_address_descriptor( +; SROA-PASS-NOT: alloca +; First descriptor chain - SROA creates incremental vector construction with template base: +; SROA-PASS: %[[VEC1:.*]].vec.insert = insertelement <4 x i32> , i32 %[[VAL1:.*]], i32 1 +; SROA-PASS: %[[VEC2:.*]].vec.insert = insertelement <4 x i32> %[[VEC1]].vec.insert, i32 %[[VAL2:.*]], i32 2 +; SROA-PASS: %[[VEC3:.*]].vec.insert = insertelement <4 x i32> %[[VEC2]].vec.insert, i32 %[[VAL3:.*]], i32 3 +; SROA-PASS: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[VEC3]].vec.insert, <8 x i32> zeroinitializer, i32 0) +; Second descriptor chain - reuses first descriptor as base (key optimization!): +; SROA-PASS: %[[VEC4:.*]].vec.insert{{[0-9]*}} = insertelement <4 x i32> %[[VEC3]].vec.insert, i32 %[[VAL4:.*]], i32 1 +; SROA-PASS: %[[VEC5:.*]].vec.insert{{[0-9]*}} = insertelement <4 x i32> %[[VEC4]].vec.insert{{[0-9]*}}, i32 %[[VAL5:.*]], i32 2 +; SROA-PASS: %[[VEC6:.*]].vec.insert{{[0-9]*}} = insertelement <4 x i32> %[[VEC5]].vec.insert{{[0-9]*}}, i32 %[[VAL6:.*]], i32 3 +; SROA-PASS: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[VEC6]].vec.insert{{[0-9]*}}, <8 x i32> zeroinitializer, i32 0) + +; Stage 3: MIR backend uses INSERT_SUBREG for base reuse (our optimization working!) 
+; MIR-PASS-LABEL: name: test_basic_address_descriptor +; MIR-PASS: %[[BASE:[0-9]+]]:{{sgpr_128|sreg_128}} = REG_SEQUENCE +; MIR-PASS: TENSOR_LOAD_TO_LDS_D2 %[[BASE]] +; MIR-PASS: %[[REG1:[0-9]+]]:{{sgpr_128|sreg_128}} = INSERT_SUBREG %[[BASE]], {{.*}}, %subreg.sub1 +; MIR-PASS: %[[REG2:[0-9]+]]:{{sgpr_128|sreg_128}} = INSERT_SUBREG %[[REG1]], {{.*}}, %subreg.sub2 +; MIR-PASS: %[[REG3:[0-9]+]]:{{sgpr_128|sreg_128}} = INSERT_SUBREG %[[REG2]], {{.*}}, %subreg.sub3 +; MIR-PASS: TENSOR_LOAD_TO_LDS_D2 {{.*}}[[REG3]] +entry: + ; Create base descriptor that will be reused (similar to loop pattern in real code) + %base_desc = insertelement <4 x i32> , i32 0, i64 0 + + ; First descriptor: update fields 1,2,3 from base + %desc1_1 = insertelement <4 x i32> %base_desc, i32 %val1, i64 1 + %desc1_2 = insertelement <4 x i32> %desc1_1, i32 %val2, i64 2 + %desc1_final = insertelement <4 x i32> %desc1_2, i32 %val3, i64 3 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc1_final, <8 x i32> zeroinitializer, i32 0) + + ; Second descriptor: reuse same base, update with different values (this should generate INSERT_SUBREG) + %desc2_1 = insertelement <4 x i32> %base_desc, i32 %val4, i64 1 + %desc2_2 = insertelement <4 x i32> %desc2_1, i32 %val5, i64 2 + %desc2_final = insertelement <4 x i32> %desc2_2, i32 %val6, i64 3 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc2_final, <8 x i32> zeroinitializer, i32 0) + ret void +} + +; Test 1b: Multiple address descriptors with same base - should share storage +define amdgpu_kernel void @test_multiple_address_descriptors(i32 %val1a, i32 %val2a, i32 %val3a, i32 %val1b, i32 %val2b, i32 %val3b) { +; CHECK-LABEL: @test_multiple_address_descriptors( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[TDM_DESC_STORAGE:.*]] = alloca <4 x i32>, align 16, addrspace(5) +; CHECK-NEXT: store <4 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: %[[TDM_FIELD_1_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 1 +; CHECK-NEXT: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; CHECK-NEXT: %[[TDM_FIELD_3_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 3 +; CHECK-NEXT: store i32 %[[VAL1A:.*]], ptr addrspace(5) %[[TDM_FIELD_1_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL2A:.*]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL3A:.*]], ptr addrspace(5) %[[TDM_FIELD_3_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC:.*]] = load <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[TDM_OPTIMIZED_DESC]], <8 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: store i32 %[[VAL1B:.*]], ptr addrspace(5) %[[TDM_FIELD_1_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL2B:.*]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL3B:.*]], ptr addrspace(5) %[[TDM_FIELD_3_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC1:.*]] = load <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[TDM_OPTIMIZED_DESC1]], <8 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: ret void +; +entry: + ; First descriptor + %insert1a = insertelement <4 x i32> , i32 %val1a, i64 1 + %insert2a = insertelement <4 x i32> %insert1a, i32 %val2a, i64 2 + %insert3a = insertelement <4 x i32> %insert2a, i32 %val3a, i64 3 + call void 
@llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %insert3a, <8 x i32> zeroinitializer, i32 0) + + ; Second descriptor with same base - should reuse storage + %insert1b = insertelement <4 x i32> , i32 %val1b, i64 1 + %insert2b = insertelement <4 x i32> %insert1b, i32 %val2b, i64 2 + %insert3b = insertelement <4 x i32> %insert2b, i32 %val3b, i64 3 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %insert3b, <8 x i32> zeroinitializer, i32 0) + ret void +} + +;===----------------------------------------------------------------------===; +; Pattern 2: Complex Tensor Descriptor Construction +; From tdm.ll analysis: 3 instances, lines 129-131, 523-525, 709-711 +;===----------------------------------------------------------------------===; + +; Test 2a: Tensor descriptor with insertelement + shufflevector pattern +define amdgpu_kernel void @test_complex_tensor_descriptor(i32 %val1, i32 %val2) { +; CHECK-LABEL: @test_complex_tensor_descriptor( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[TDM_DESC_STORAGE:.*]] = alloca <8 x i32>, align 32, addrspace(5) +; CHECK-NEXT: store <8 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 32 +; CHECK-NEXT: %[[TDM_FIELD_1_PTR:.*]] = getelementptr <8 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 1 +; CHECK-NEXT: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <8 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; CHECK-NEXT: store i32 %[[VAL1:.*]], ptr addrspace(5) %[[TDM_FIELD_1_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL2:.*]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC:.*]] = load <8 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 32 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> %[[TDM_OPTIMIZED_DESC]], i32 0) +; CHECK-NEXT: ret void +; +entry: + ; Pattern from tdm.ll: complex construction with shufflevector + ; %125 = insertelement <8 x i32> , i32 %val1, i64 1 + ; %126 = shufflevector ... 
mask <0,1,poison,11,12,13,14,15> + ; %127 = insertelement <8 x i32> %126, i32 %val2, i64 2 + + %step1 = insertelement <8 x i32> , i32 %val1, i64 1 + %step2 = shufflevector <8 x i32> %step1, <8 x i32> , <8 x i32> + %final = insertelement <8 x i32> %step2, i32 %val2, i64 2 + + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> %final, i32 0) + ret void +} + +; Test 2b: Tensor descriptor with template B (4194304, 32, 128, 0, 0) +define amdgpu_kernel void @test_tensor_descriptor_template_b(i32 %val1, i32 %val2) { +; CHECK-LABEL: @test_tensor_descriptor_template_b( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[TDM_DESC_STORAGE:.*]] = alloca <8 x i32>, align 32, addrspace(5) +; CHECK-NEXT: store <8 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 32 +; CHECK-NEXT: %[[TDM_FIELD_1_PTR:.*]] = getelementptr <8 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 1 +; CHECK-NEXT: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <8 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; CHECK-NEXT: store i32 %[[VAL1:.*]], ptr addrspace(5) %[[TDM_FIELD_1_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL2:.*]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC:.*]] = load <8 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 32 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> %[[TDM_OPTIMIZED_DESC]], i32 0) +; CHECK-NEXT: ret void +; +entry: + ; Template B pattern from tdm.ll (used 2x) + %step1 = insertelement <8 x i32> , i32 %val1, i64 1 + %step2 = shufflevector <8 x i32> %step1, <8 x i32> , <8 x i32> + %final = insertelement <8 x i32> %step2, i32 %val2, i64 2 + + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> %final, i32 0) + ret void +} + +;===----------------------------------------------------------------------===; +; Pattern 3: Cross-Iteration Loop Reuse (HIGHEST IMPACT) +; From tdm.ll analysis: 3,700+ line loop with descriptor reuse +;===----------------------------------------------------------------------===; + +; Test 3: Loop-based descriptor creation with cross-iteration reuse +define amdgpu_kernel void @test_loop_descriptor_reuse(i32 %base_val) { +; CHECK-LABEL: @test_loop_descriptor_reuse( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[TDM_DESC_STORAGE:.*]] = alloca <4 x i32>, align 16, addrspace(5) +; CHECK-NEXT: store <4 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: %[[TDM_FIELD_1_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 1 +; CHECK-NEXT: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; CHECK-NEXT: %[[TDM_FIELD_3_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 3 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: loop: +; CHECK-NEXT: %[[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:.*]], %[[LOOP]] ] +; CHECK-NEXT: %[[VAL1:.*]] = add i32 %[[I]], 100 +; CHECK-NEXT: %[[VAL2:.*]] = add i32 %[[I]], 200 +; CHECK-NEXT: %[[VAL3:%.*]] = add i32 %[[I]], [[BASE_VAL:.*]] +; CHECK-NEXT: store i32 %[[VAL1]], ptr addrspace(5) %[[TDM_FIELD_1_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL2]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL3]], ptr addrspace(5) %[[TDM_FIELD_3_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC:.*]] = load <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> 
%[[TDM_OPTIMIZED_DESC]], <8 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: %[[I_NEXT]] = add i32 %[[I]], 1 +; CHECK-NEXT: %[[COND:.*]] = icmp ult i32 %[[I_NEXT]], 10 +; CHECK-NEXT: br i1 %[[COND]], label %[[LOOP]], label [[EXIT:.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %i = phi i32 [0, %entry], [%i.next, %loop] + + ; Compute dynamic values + %val1 = add i32 %i, 100 + %val2 = add i32 %i, 200 + %val3 = add i32 %i, %base_val + + ; Create descriptor each iteration (inefficient - same base, different fields) + %insert1 = insertelement <4 x i32> , i32 %val1, i64 1 + %insert2 = insertelement <4 x i32> %insert1, i32 %val2, i64 2 + %insert3 = insertelement <4 x i32> %insert2, i32 %val3, i64 3 + + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %insert3, <8 x i32> zeroinitializer, i32 0) + + %i.next = add i32 %i, 1 + %cond = icmp ult i32 %i.next, 10 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +;===----------------------------------------------------------------------===; +; Pattern 4: Loop-Invariant Tensor Base Reuse +; From tdm.ll analysis: Lines 1732, 1774, 1780 - tensor bases created outside loop +;===----------------------------------------------------------------------===; + +; Test 4: Tensor descriptor with loop-invariant base, only field 2 changes +define amdgpu_kernel void @test_tensor_base_reuse(i32 %start_val) { +; CHECK-LABEL: @test_tensor_base_reuse( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[TDM_DESC_STORAGE:.*]] = alloca <8 x i32>, align 32, addrspace(5) +; CHECK-NEXT: store <8 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 32 +; CHECK-NEXT: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <8 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: loop: +; CHECK-NEXT: %[[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:.*]], %[[LOOP]] ] +; CHECK-NEXT: %[[FIELD2_VAL:%.*]] = add i32 %[[I]], [[START_VAL:.*]] +; CHECK-NEXT: store i32 %[[FIELD2_VAL]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC:.*]] = load <8 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 32 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> %[[TDM_OPTIMIZED_DESC]], i32 0) +; CHECK-NEXT: %[[I_NEXT]] = add i32 %[[I]], 1 +; CHECK-NEXT: %[[COND:.*]] = icmp ult i32 %[[I_NEXT]], 5 +; CHECK-NEXT: br i1 %[[COND]], label %[[LOOP]], label [[EXIT:.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + ; Create loop-invariant tensor base outside loop (optimal placement) + %tensor_base = shufflevector <8 x i32> , <8 x i32> , <8 x i32> + br label %loop + +loop: + %i = phi i32 [0, %entry], [%i.next, %loop] + + ; Only field 2 changes each iteration (high reuse potential!) 
+ %field2_val = add i32 %i, %start_val + %descriptor = insertelement <8 x i32> %tensor_base, i32 %field2_val, i64 2 + + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> %descriptor, i32 0) + + %i.next = add i32 %i, 1 + %cond = icmp ult i32 %i.next, 5 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +;===----------------------------------------------------------------------===; +; Negative Tests: Patterns that should NOT be optimized +;===----------------------------------------------------------------------===; + +; Test 5a: Low field reuse - should not optimize +define amdgpu_kernel void @test_no_opt_low_reuse(i32 %val1, i32 %val2, i32 %val3, i32 %val4) { +; CHECK-LABEL: @test_no_opt_low_reuse( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[INSERT1:%.*]] = insertelement <4 x i32> , i32 [[VAL1:.*]], i64 0 +; CHECK-NEXT: %[[INSERT2:%.*]] = insertelement <4 x i32> %[[INSERT1]], i32 [[VAL2:.*]], i64 1 +; CHECK-NEXT: %[[INSERT3:%.*]] = insertelement <4 x i32> %[[INSERT2]], i32 [[VAL3:.*]], i64 2 +; CHECK-NEXT: %[[INSERT4:%.*]] = insertelement <4 x i32> %[[INSERT3]], i32 [[VAL4:.*]], i64 3 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[INSERT4]], <8 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: ret void +; +entry: + ; All fields variable, no constants - low benefit, should not optimize + %insert1 = insertelement <4 x i32> , i32 %val1, i64 0 + %insert2 = insertelement <4 x i32> %insert1, i32 %val2, i64 1 + %insert3 = insertelement <4 x i32> %insert2, i32 %val3, i64 2 + %insert4 = insertelement <4 x i32> %insert3, i32 %val4, i64 3 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %insert4, <8 x i32> zeroinitializer, i32 0) + ret void +} + +; Test 5b: Single use descriptor - borderline case +define amdgpu_kernel void @test_no_opt_single_use(i32 %val1, i32 %val2) { +; CHECK-LABEL: @test_no_opt_single_use( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[INSERT1:%.*]] = insertelement <4 x i32> , i32 [[VAL1:.*]], i64 2 +; CHECK-NEXT: %[[INSERT2:%.*]] = insertelement <4 x i32> %[[INSERT1]], i32 [[VAL2:.*]], i64 3 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[INSERT2]], <8 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: ret void +; +entry: + ; Single use with good field reuse - might optimize for address descriptors + %insert1 = insertelement <4 x i32> , i32 %val1, i64 2 + %insert2 = insertelement <4 x i32> %insert1, i32 %val2, i64 3 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %insert2, <8 x i32> zeroinitializer, i32 0) + ret void +} + +;===----------------------------------------------------------------------===; +; Real-world patterns from tdm.ll analysis +;===----------------------------------------------------------------------===; + +; Test 6: Exact pattern from tdm.ll lines 2986-2988 (inside loop) +define amdgpu_kernel void @test_real_tdm_pattern(i32 %computed_val1, i32 %computed_val2, i32 %computed_val3) { +; CHECK-LABEL: @test_real_tdm_pattern( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[TDM_DESC_STORAGE:.*]] = alloca <4 x i32>, align 16, addrspace(5) +; CHECK-NEXT: store <4 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: %[[TDM_FIELD_1_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 1 +; CHECK-NEXT: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; CHECK-NEXT: %[[TDM_FIELD_3_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 3 +; CHECK-NEXT: 
store i32 %[[COMPUTED_VAL1:.*]], ptr addrspace(5) %[[TDM_FIELD_1_PTR]], align 4 +; CHECK-NEXT: store i32 %[[COMPUTED_VAL2:.*]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: store i32 %[[COMPUTED_VAL3:.*]], ptr addrspace(5) %[[TDM_FIELD_3_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC:.*]] = load <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[TDM_OPTIMIZED_DESC]], <8 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: ret void +; +entry: + ; Exact pattern from tdm.ll (high-impact case) + %2395 = insertelement <4 x i32> , i32 %computed_val1, i64 1 + %2396 = insertelement <4 x i32> %2395, i32 %computed_val2, i64 2 + %2397 = insertelement <4 x i32> %2396, i32 %computed_val3, i64 3 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %2397, <8 x i32> zeroinitializer, i32 0) + ret void +} + +; Test 7: Template normalization - different poison/undef patterns should be unified +define amdgpu_kernel void @test_template_normalization_a(i32 %val1, i32 %val2, i32 %val3) { +; CHECK-LABEL: @test_template_normalization_a( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[TDM_DESC_STORAGE:.*]] = alloca <4 x i32>, align 16, addrspace(5) +; CHECK-NEXT: store <4 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: %[[TDM_FIELD_1_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 1 +; CHECK-NEXT: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; CHECK-NEXT: %[[TDM_FIELD_3_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 3 +; CHECK-NEXT: store i32 %[[VAL1:.*]], ptr addrspace(5) %[[TDM_FIELD_1_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL2:.*]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL3:.*]], ptr addrspace(5) %[[TDM_FIELD_3_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC:.*]] = load <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[TDM_OPTIMIZED_DESC]], <8 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: ret void +; +entry: + ; Variation A: + %desc = insertelement <4 x i32> , i32 %val1, i64 1 + %desc2 = insertelement <4 x i32> %desc, i32 %val2, i64 2 + %desc3 = insertelement <4 x i32> %desc2, i32 %val3, i64 3 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc3, <8 x i32> zeroinitializer, i32 0) + ret void +} + +define amdgpu_kernel void @test_template_normalization_b(i32 %val1, i32 %val2, i32 %val3) { +; CHECK-LABEL: @test_template_normalization_b( +; CHECK-NEXT: entry: +; CHECK-NEXT: %[[TDM_DESC_STORAGE:.*]] = alloca <4 x i32>, align 16, addrspace(5) +; CHECK-NEXT: store <4 x i32> , ptr addrspace(5) %[[TDM_DESC_STORAGE]], align 16 +; CHECK-NEXT: %[[TDM_FIELD_1_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 1 +; CHECK-NEXT: %[[TDM_FIELD_2_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 2 +; CHECK-NEXT: %[[TDM_FIELD_3_PTR:.*]] = getelementptr <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], i32 0, i32 3 +; CHECK-NEXT: store i32 %[[VAL1:.*]], ptr addrspace(5) %[[TDM_FIELD_1_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL2:.*]], ptr addrspace(5) %[[TDM_FIELD_2_PTR]], align 4 +; CHECK-NEXT: store i32 %[[VAL3:.*]], ptr addrspace(5) %[[TDM_FIELD_3_PTR]], align 4 +; CHECK-NEXT: %[[TDM_OPTIMIZED_DESC:.*]] = load <4 x i32>, ptr addrspace(5) %[[TDM_DESC_STORAGE]], 
align 16 +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %[[TDM_OPTIMIZED_DESC]], <8 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: ret void +; +entry: + ; Variation B: - should normalize to same template + %desc = insertelement <4 x i32> , i32 %val1, i64 1 + %desc2 = insertelement <4 x i32> %desc, i32 %val2, i64 2 + %desc3 = insertelement <4 x i32> %desc2, i32 %val3, i64 3 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc3, <8 x i32> zeroinitializer, i32 0) + ret void +} + +;===----------------------------------------------------------------------===; +; Threshold Testing: Verify different benefit scores work with threshold +;===----------------------------------------------------------------------===; + +; Test 1: Low benefit (should not optimize with default threshold=10) +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-tdm-opt-threshold=10 -stop-after=amdgpu-tdm-optimization < %s | FileCheck %s --check-prefix=THRESHOLD10-LOW +define amdgpu_kernel void @test_threshold_low_benefit(i32 %val1) { +; THRESHOLD10-LOW-LABEL: @test_threshold_low_benefit( +; THRESHOLD10-LOW: %insert1 = insertelement <4 x i32> , i32 %val1.load, i64 1 +; THRESHOLD10-LOW-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %insert1, <8 x i32> zeroinitializer, i32 0) +entry: + ; Benefit = 1 constant field * 2 = 2 (below threshold=10) + %insert1 = insertelement <4 x i32> , i32 %val1, i64 1 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %insert1, <8 x i32> zeroinitializer, i32 0) + ret void +} + +; Test 2: Low benefit (should optimize with lower threshold=2) +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-tdm-opt-threshold=2 -stop-after=amdgpu-tdm-optimization < %s | FileCheck %s --check-prefix=THRESHOLD2-LOW +define amdgpu_kernel void @test_threshold_low_benefit_enabled(i32 %val1) { +; THRESHOLD2-LOW-LABEL: @test_threshold_low_benefit_enabled( +; THRESHOLD2-LOW: alloca <4 x i32> +; THRESHOLD2-LOW: store <4 x i32> +; THRESHOLD2-LOW: getelementptr <4 x i32>{{.*}}i32 1 +; THRESHOLD2-LOW: store i32 %val1.load +; THRESHOLD2-LOW: load <4 x i32> +entry: + ; Benefit = 1 constant field * 2 = 2 (meets threshold=2) + %insert1 = insertelement <4 x i32> , i32 %val1, i64 1 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %insert1, <8 x i32> zeroinitializer, i32 0) + ret void +} + +; Test 3: Medium benefit (should optimize with default threshold=10) +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-tdm-opt-threshold=10 -stop-after=amdgpu-tdm-optimization < %s | FileCheck %s --check-prefix=THRESHOLD10-MED +define amdgpu_kernel void @test_threshold_medium_benefit(i32 %val1, i32 %val2) { +; THRESHOLD10-MED-LABEL: @test_threshold_medium_benefit( +; THRESHOLD10-MED: alloca <4 x i32> +; THRESHOLD10-MED: store <4 x i32> +entry: + ; Group of 2 patterns with 3 constant fields each = 3*2*2 = 12 benefit (above threshold=10) + %desc1 = insertelement <4 x i32> , i32 %val1, i64 1 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc1, <8 x i32> zeroinitializer, i32 0) + + %desc2 = insertelement <4 x i32> , i32 %val2, i64 1 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc2, <8 x i32> zeroinitializer, i32 0) + ret void +} + +; Test 4: High threshold blocks even good patterns +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-tdm-opt-threshold=50 -stop-after=amdgpu-tdm-optimization < %s | FileCheck %s --check-prefix=THRESHOLD50-HIGH +define amdgpu_kernel void @test_threshold_high_blocks_opt(i32 %val1, i32 %val2) { +; 
THRESHOLD50-HIGH-LABEL: @test_threshold_high_blocks_opt( +; THRESHOLD50-HIGH: insertelement +; THRESHOLD50-HIGH-NOT: alloca <4 x i32> +entry: + ; Same pattern as above but benefit=12 is below threshold=50 + %desc1 = insertelement <4 x i32> , i32 %val1, i64 1 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc1, <8 x i32> zeroinitializer, i32 0) + + %desc2 = insertelement <4 x i32> , i32 %val2, i64 1 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc2, <8 x i32> zeroinitializer, i32 0) + ret void +} + +; Test 5: Loop pattern with threshold (loop multiplier = 5x) +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-tdm-opt-threshold=8 -stop-after=amdgpu-tdm-optimization < %s | FileCheck %s --check-prefix=THRESHOLD8-LOOP +define amdgpu_kernel void @test_threshold_loop_multiplier(i32* %data, i32 %count) { +; THRESHOLD8-LOOP-LABEL: @test_threshold_loop_multiplier( +; THRESHOLD8-LOOP: alloca <4 x i32> +; THRESHOLD8-LOOP: store <4 x i32> +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %next, %loop ] + %ptr = getelementptr i32, i32* %data, i32 %i + %val = load i32, i32* %ptr + + ; Benefit = 1 pattern * 3 constant fields * 2 * 5 (loop multiplier) = 30 (above threshold=8) + %desc = insertelement <4 x i32> , i32 %val, i64 1 + call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %desc, <8 x i32> zeroinitializer, i32 0) + + %next = add i32 %i, 1 + %cond = icmp ult i32 %next, %count + br i1 %cond, label %loop, label %exit + +exit: + ret void +}
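
;===----------------------------------------------------------------------===;
; Additional illustrative negative test (not part of the original patch):
; isDescriptorType only accepts <4 x i32> and <8 x i32>, so other vector
; widths are assumed to pass through the TDM pass untouched.
;===----------------------------------------------------------------------===;
define amdgpu_kernel void @test_non_descriptor_width(i32 %val1, i32 %val2, ptr addrspace(1) %out) {
; TDM-PASS-LABEL: @test_non_descriptor_width(
; TDM-PASS-NOT: alloca
; TDM-PASS: insertelement <2 x i32>
; TDM-PASS: store <2 x i32>
entry:
  ; <2 x i32> is neither an address (<4 x i32>) nor a tensor (<8 x i32>)
  ; descriptor, so no alloca/field-store rewrite is expected here.
  %v1 = insertelement <2 x i32> <i32 0, i32 poison>, i32 %val1, i64 1
  %v2 = insertelement <2 x i32> %v1, i32 %val2, i64 0
  store <2 x i32> %v2, ptr addrspace(1) %out, align 8
  ret void
}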