diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml index 407b41802eea7..2be059422043a 100644 --- a/.azuredevops/rocm-ci.yml +++ b/.azuredevops/rocm-ci.yml @@ -23,5 +23,7 @@ trigger: - '**/*.md' - LICENSE.TXT +pr: none + jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/llvm-project.yml@pipelines_repo diff --git a/.github/workflows/PSDB-amd-mainline.yml b/.github/workflows/PSDB-amd-mainline.yml new file mode 100644 index 0000000000000..35a3095419194 --- /dev/null +++ b/.github/workflows/PSDB-amd-mainline.yml @@ -0,0 +1,106 @@ +name: Compiler CI PSDB trigger on amd-mainline branch + +# Controls when the workflow will run +on: + pull_request: + branches: [amd-mainline] + types: [opened, reopened, synchronize, ready_for_review] + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel; this workflow defines a single job that invokes the Jenkins PSDB pipeline +jobs: + # This workflow contains a single job called "invoke_jenkins_PSDB" + invoke_jenkins_PSDB: + if: github.event.pull_request.draft == false + runs-on: + group: compiler-generic-runners + env: + svc_acc_org_secret: ${{secrets.CI_GITHUB_TOKEN}} + input_sha: ${{ github.event.pull_request.head.sha != '' && github.event.pull_request.head.sha || github.sha }} + input_pr_num: ${{ github.event.pull_request.number != '' && github.event.pull_request.number || 0 }} + input_pr_url: ${{ github.event.pull_request.html_url != '' && github.event.pull_request.html_url || '' }} + input_pr_title: ${{ github.event.pull_request.title != '' && github.event.pull_request.title || '' }} + # set the pipeline name here based on branch name + pipeline_name: ${{secrets.CI_JENKINS_MAINLINE_JOB_NAME}} + JENKINS_URL: ${{secrets.CI_JENKINS_URL}} + CONTAINER_IMAGE: ${{ secrets.JENKINS_TRIGGER_DOCKER_IMAGE }} + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + + # Record the container image and a unique container name for the steps below + - name: Set environment variable for container image + run: | + echo "CONTAINER_IMAGE=${{ secrets.JENKINS_TRIGGER_DOCKER_IMAGE }}" >> $GITHUB_ENV + echo "CONTAINER_NAME=my_container_${{ github.run_id }}" >> $GITHUB_ENV + + + - name: Pull container image + run: docker pull "${{env.CONTAINER_IMAGE}}" + + + - name: Run container + run: | + docker run -d --name "${{env.CONTAINER_NAME}}" $CONTAINER_IMAGE sleep infinity + #docker exec "${{env.CONTAINER_NAME}}" /bin/bash -c "git clone ${{secrets.CI_UTILS_REPO}} ."
+ docker exec "${{env.CONTAINER_NAME}}" /bin/bash -c "echo 'Running commands inside the container'" + + - name: Escape pull request title + run: | + import json + import os + import shlex + with open('${{ github.event_path }}') as fh: + event = json.load(fh) + escaped = shlex.quote(event['pull_request']['title']) + with open(os.environ['GITHUB_ENV'], 'a') as fh: + print(f'PR_TITLE={escaped}', file=fh) + shell: python3 {0} + + - name: Run Jenkins Cancel Script + env: + JENKINS_URL: ${{secrets.CI_JENKINS_URL}} + JENKINS_USER: ${{secrets.CI_JENKINS_USER}} + JENKINS_API_TOKEN: ${{secrets.CI_JENKINS_TOKEN}} + JENKINS_JOB_NAME: ${{secrets.CI_JENKINS_JOB_NAME}} + PR_NUMBER: ${{ github.event.pull_request.number }} + COMMIT_HASH: ${{ github.event.after }} + run: | + docker exec -e JENKINS_JOB_NAME=${{secrets.CI_JENKINS_JOB_NAME}} -e PR_NUMBER=${{ github.event.pull_request.number }} -e COMMIT_HASH=${{ github.event.after }} -e JENKINS_URL=${{secrets.CI_JENKINS_URL}} -e JENKINS_USER=${{secrets.CI_JENKINS_USER}} -e JENKINS_API_TOKEN=${{secrets.CI_JENKINS_TOKEN}} "${{env.CONTAINER_NAME}}" /bin/bash -c "PYTHONHTTPSVERIFY=0 python3 cancel_previous_build.py" + + + # Runs a set of commands using the runner's shell + - name: Getting Event Details + run: | + echo $(pwd) + echo $GITHUB_ENV + echo $GITHUB_REPOSITORY + echo $GITHUB_SERVER_URL + echo "GITHUB_SHA is: $GITHUB_SHA" + echo "GITHUB_WORKFLOW_SHA is: $GITHUB_WORKFLOW_SHA" + echo "GITHUB_BASE_REF is: $GITHUB_BASE_REF" + echo "GITHUB_REF_NAME is: $GITHUB_REF_NAME" + echo "github.event.pull_request.id is: ${{github.event.pull_request.id}}" + echo "github.event.pull_request.html_url is: ${{github.event.pull_request.html_url}}" + echo "github.event.pull_request.number is: ${{github.event.pull_request.number}}" + echo "github.event.pull_request.url is: ${{github.event.pull_request.url}}" + echo "github.event.pull_request.issue_url is: ${{github.event.pull_request.issue_url}}" + echo "github.event.pull_request.head.sha is: ${{github.event.pull_request.head.sha}}" + echo "github.event.pull_request.base.ref is: ${{github.event.pull_request.base.ref}}" + echo "github.event.pull_request.merge_commit_sha is: ${{github.event.pull_request.merge_commit_sha}}" + echo "github.event.pull_request is: ${{github.event.pull_request}}" + + + - name: Trigger Jenkins Pipeline + if: steps.check_changes.outcome != 'failure' + run: | + echo "--Running jenkins_api.py with input sha - $input_sha for pull request - $input_pr_url" + docker exec -e GITHUB_REPOSITORY="$GITHUB_REPOSITORY" -e svc_acc_org_secret="$svc_acc_org_secret" -e input_sha="$input_sha" -e input_pr_url="$input_pr_url" -e pipeline_name="$pipeline_name" \ + -e input_pr_num="$input_pr_num" -e PR_TITLE="$PR_TITLE" -e JENKINS_URL="$JENKINS_URL" -e GITHUB_PAT="$svc_acc_org_secret" "${{env.CONTAINER_NAME}}" \ + /bin/bash -c 'echo \"PR NUM: "$input_pr_num"\" && PYTHONHTTPSVERIFY=0 python3 jenkins_api.py -s \"${JENKINS_URL}\" -jn "$pipeline_name" -ghr "$GITHUB_REPOSITORY" -ghsha "$input_sha" -ghprn "$input_pr_num" -ghpru "$input_pr_url" -ghprt "$PR_TITLE" -ghpat="$svc_acc_org_secret"' + + - name: Stop and remove container + if: always() + run: | + docker stop "${{env.CONTAINER_NAME}}" + docker rm "${{env.CONTAINER_NAME}}" diff --git a/amd/comgr/src/comgr-compiler.cpp b/amd/comgr/src/comgr-compiler.cpp index 5e69d9b9cfab5..84586de078f9f 100644 --- a/amd/comgr/src/comgr-compiler.cpp +++ b/amd/comgr/src/comgr-compiler.cpp @@ -810,16 +810,11 @@ amd_comgr_status_t AMDGPUCompiler::createTmpDirs() { return AMD_COMGR_STATUS_SUCCESS; }
-amd_comgr_status_t AMDGPUCompiler::removeTmpDirs() { - if (TmpDir.empty()) { - return AMD_COMGR_STATUS_SUCCESS; - } - ProfilePoint Point("RemoveDir"); - -#ifdef _WIN32 // On windows fs::remove_directories takes huge time so use fs::remove. +#ifdef _WIN32 +amd_comgr_status_t removeDirectory(const StringRef DirName) { std::error_code EC; - for (fs::directory_iterator Dir(TmpDir, EC), DirEnd; Dir != DirEnd && !EC; + for (fs::directory_iterator Dir(DirName, EC), DirEnd; Dir != DirEnd && !EC; Dir.increment(EC)) { const StringRef Path = Dir->path(); @@ -849,16 +844,26 @@ amd_comgr_status_t AMDGPUCompiler::removeTmpDirs() { } } - if (fs::remove(TmpDir)) { + if (fs::remove(DirName)) { return AMD_COMGR_STATUS_ERROR; } return AMD_COMGR_STATUS_SUCCESS; -#else +} +#endif + +amd_comgr_status_t AMDGPUCompiler::removeTmpDirs() { + if (TmpDir.empty()) { + return AMD_COMGR_STATUS_SUCCESS; + } + ProfilePoint Point("RemoveDir"); +#ifndef _WIN32 if (fs::remove_directories(TmpDir)) { return AMD_COMGR_STATUS_ERROR; } return AMD_COMGR_STATUS_SUCCESS; +#else + return removeDirectory(TmpDir); #endif } diff --git a/amd/comgr/src/comgr.cpp b/amd/comgr/src/comgr.cpp index 9e1d6aba2c8df..6d5fce972e9be 100644 --- a/amd/comgr/src/comgr.cpp +++ b/amd/comgr/src/comgr.cpp @@ -310,7 +310,8 @@ amd_comgr_status_t COMGR::parseTargetIdentifier(StringRef IdentStr, // TODO: Add a LIT test for this - if (IdentStr == "amdgcn-amd-amdhsa--amdgcnspirv") { + if (IdentStr == "amdgcn-amd-amdhsa--amdgcnspirv" || + IdentStr == "amdgcn-amd-amdhsa-unknown-amdgcnspirv") { // Features not supported for SPIR-V if (!Ident.Features.empty()) return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; @@ -2110,6 +2111,8 @@ amd_comgr_populate_name_expression_map(amd_comgr_data_t Data, size_t *Count) { if (!RelaRangeOrError) { llvm::logAllUnhandledErrors(RelaRangeOrError.takeError(), llvm::errs(), "RelaRange creation error: "); + for (auto *Ptr : NameExpDataVec) + delete Ptr; return AMD_COMGR_STATUS_ERROR; } auto RelaRange = std::move(RelaRangeOrError.get()); @@ -2130,6 +2133,8 @@ amd_comgr_populate_name_expression_map(amd_comgr_data_t Data, size_t *Count) { if (!RodataOrError) { llvm::logAllUnhandledErrors(RodataOrError.takeError(), llvm::errs(), "Rodata creation error: "); + for (auto *Ptr : NameExpDataVec) + delete Ptr; return AMD_COMGR_STATUS_ERROR; } auto Rodata = std::move(RodataOrError.get()); @@ -2160,6 +2165,8 @@ amd_comgr_populate_name_expression_map(amd_comgr_data_t Data, size_t *Count) { } } + for (auto *Ptr : NameExpDataVec) + delete Ptr; } // end AMD_COMGR_DATA_KIND_EXECUTABLE conditional *Count = DataP->NameExpressionMap.size(); diff --git a/amd/comgr/test-lit/CMakeLists.txt b/amd/comgr/test-lit/CMakeLists.txt index 993fb967b7df5..73966d7f30722 100644 --- a/amd/comgr/test-lit/CMakeLists.txt +++ b/amd/comgr/test-lit/CMakeLists.txt @@ -21,10 +21,7 @@ if (NOT DEFINED LLVM_LIT_PATH) endif() message("-- LLVM_LIT_PATH: ${LLVM_LIT_PATH}") -# TODO: Re-enable target once nPSDB issue with llvm-lit is fixed -#add_custom_target(test-lit COMMAND "${LLVM_LIT_PATH}" -# "${CMAKE_CURRENT_BINARY_DIR}" -v) -add_custom_target(test-lit COMMAND echo "${LLVM_LIT_PATH}" +add_custom_target(test-lit COMMAND "${LLVM_LIT_PATH}" "${CMAKE_CURRENT_BINARY_DIR}" -v) macro(add_comgr_lit_binary name lang) diff --git a/clang/docs/ClangOffloadBundler.rst b/clang/docs/ClangOffloadBundler.rst index 62cf1642a03a3..5570dbb08ab9a 100644 --- a/clang/docs/ClangOffloadBundler.rst +++ b/clang/docs/ClangOffloadBundler.rst @@ -525,15 +525,15 @@ The compressed offload bundle begins with a 
header followed by the compressed bi This is a unique identifier to distinguish compressed offload bundles. The value is the string 'CCOB' (Compressed Clang Offload Bundle). - **Version Number (16-bit unsigned int)**: - This denotes the version of the compressed offload bundle format. The current version is `2`. + This denotes the version of the compressed offload bundle format. The current version is `3`. - **Compression Method (16-bit unsigned int)**: This field indicates the compression method used. The value corresponds to either `zlib` or `zstd`, represented as a 16-bit unsigned integer cast from the LLVM compression enumeration. -- **Total File Size (32-bit unsigned int)**: +- **Total File Size (unsigned int, 32-bit in v2, 64-bit in v3)**: This is the total size (in bytes) of the file, including the header. Available in version 2 and above. -- **Uncompressed Binary Size (32-bit unsigned int)**: +- **Uncompressed Binary Size (unsigned int, 32-bit in v2, 64-bit in v3)**: This is the size (in bytes) of the binary data before it was compressed. - **Hash (64-bit unsigned int)**: @@ -542,4 +542,4 @@ The compressed offload bundle begins with a header followed by the compressed bi - **Compressed Data**: The actual compressed binary data follows the header. Its size can be inferred from the total size of the file minus the header size. - > **Note**: Version 3 of the format is under development. It uses 64-bit fields for Total File Size and Uncompressed Binary Size to support files larger than 4GB. To experiment with version 3, set the environment variable `COMPRESSED_BUNDLE_FORMAT_VERSION=3`. This support is experimental and not recommended for production use. + > **Note**: Version 3 is now the default format. For backward compatibility with older HIP runtimes that support version 2 only, set the environment variable `COMPRESSED_BUNDLE_FORMAT_VERSION=2`. 
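For reference, the v2/v3 layout described above can be exercised with a small standalone reader. The following is a minimal sketch, not the in-tree implementation: it assumes a little-endian host, handles only versions 2 and 3 (v1 has no total-file-size field), and every name in it is illustrative.

```cpp
// Reads the compressed offload-bundle header documented above.
// Field order: magic "CCOB", version (u16), method (u16), total file size,
// uncompressed size, truncated MD5 hash. The two size fields are 32-bit in
// v2 and 64-bit in v3; everything is little-endian.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <optional>

struct BundleHeader {
  uint16_t Version = 0;
  uint16_t Method = 0;           // cast of the LLVM compression enum (zlib/zstd)
  uint64_t TotalFileSize = 0;    // includes the header itself
  uint64_t UncompressedSize = 0; // payload size before compression
  uint64_t Hash = 0;             // truncated MD5 of the uncompressed data
};

inline std::optional<BundleHeader> parseBundleHeader(const char *Buf,
                                                     size_t Len) {
  auto ReadLE = [&](size_t Off, size_t N) { // little-endian field of N bytes
    uint64_t V = 0;
    std::memcpy(&V, Buf + Off, N);
    return V;
  };
  if (Len < 8 || std::memcmp(Buf, "CCOB", 4) != 0)
    return std::nullopt; // too short or wrong magic
  BundleHeader H;
  H.Version = static_cast<uint16_t>(ReadLE(4, 2));
  H.Method = static_cast<uint16_t>(ReadLE(6, 2));
  const size_t W = H.Version >= 3 ? 8 : 4; // size fields widen in v3
  if (H.Version < 2 || Len < 8 + 2 * W + 8)
    return std::nullopt; // v1 (no total-size field) is not handled here
  H.TotalFileSize = ReadLE(8, W);
  H.UncompressedSize = ReadLE(8 + W, W);
  H.Hash = ReadLE(8 + 2 * W, 8);
  return H;
}
```

Since v2 and v3 differ only in the width of the two size fields, a single reader can branch on the version it has just parsed; it is also why falling back to v2 via `COMPRESSED_BUNDLE_FORMAT_VERSION=2` is safe only for bundles under 4GB.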
diff --git a/clang/include/clang/Driver/OffloadBundler.h b/clang/include/clang/Driver/OffloadBundler.h index 667156a524b79..e7306ce3cc9ab 100644 --- a/clang/include/clang/Driver/OffloadBundler.h +++ b/clang/include/clang/Driver/OffloadBundler.h @@ -120,7 +120,7 @@ class CompressedOffloadBundle { static llvm::Expected<CompressedBundleHeader> tryParse(llvm::StringRef); }; - static inline const uint16_t DefaultVersion = 2; + static inline const uint16_t DefaultVersion = 3; static llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> compress(llvm::compression::Params P, const llvm::MemoryBuffer &Input, diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 14950553ab1c9..c5b84bdaeaf1f 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -304,7 +304,7 @@ static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { P->setExternallyInitialized(true); return CGF.Builder.CreateLoad( - RawAddress(P, PTy, CharUnits::One(), KnownNonNull), true); + RawAddress(P, PTy, CharUnits::One(), KnownNonNull)); } Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 24a46eff09a50..b4efc6c3a61ef 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -483,7 +483,23 @@ ParsedClangName ToolChain::getTargetAndModeFromProgramName(StringRef PN) { std::string ProgName = normalizeProgramName(PN); size_t SuffixPos; + bool FlangNew = ProgName.find("flang-new") != std::string::npos; const DriverSuffix *DS = parseDriverSuffix(ProgName, SuffixPos); + + // Part II: Warn if the invocation happens with flang-new (for Flang); this + // is temporary and should be removed once AMD Classic Flang has been + // removed from ROCm. + if (FlangNew) { + // flang-new warning is overwarning, disabling until fixed. + if (false && !::getenv("AMD_NOWARN_FLANG_NEW")) { + // The solution with "llvm::errs()" is not ideal, but the driver object + // has not been constructed yet, so we cannot use the Diag() infrastructure + // for this. + llvm::errs() << "warning: the 'amdflang-new' and 'flang-new' commands " + "have been deprecated; please use 'amdflang' instead\n"; + } + } + if (!DS) return {}; size_t SuffixEnd = SuffixPos + strlen(DS->Suffix); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index ee6ee08c27d92..c6a9b67d08bec 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -68,6 +68,23 @@ using namespace clang::driver::tools; using namespace clang; using namespace llvm::opt; +static bool addRPathCmdArg(const llvm::opt::ArgList &Args, + ArgStringList &CmdArgs, + const std::string pathCandidate, + bool onlyIfPathExists = true) { + SmallString<0> simplifiedPathCandidate(pathCandidate); + llvm::sys::path::remove_dots(simplifiedPathCandidate, true); + + bool pathExists = llvm::sys::fs::exists(simplifiedPathCandidate); + + if (onlyIfPathExists && !pathExists) + return false; + + CmdArgs.push_back("-rpath"); + CmdArgs.push_back(Args.MakeArgString(simplifiedPathCandidate)); + return pathExists; +} + static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args, const llvm::Triple &Triple) { if (Args.hasArg(clang::driver::options::OPT_pg) && @@ -1351,12 +1368,8 @@ void tools::addOpenMPRuntimeSpecificRPath(const ToolChain &TC, // one of the LIBRARY_PATH directories.
ArgStringList EnvLibraryPaths; addDirectoryList(Args, EnvLibraryPaths, "", "LIBRARY_PATH"); - for (auto &EnvLibraryPath : EnvLibraryPaths) { - if (llvm::sys::fs::exists(EnvLibraryPath)) { - CmdArgs.push_back("-rpath"); - CmdArgs.push_back(Args.MakeArgString(EnvLibraryPath)); - } - } + for (auto &EnvLibraryPath : EnvLibraryPaths) + addRPathCmdArg(Args, CmdArgs, EnvLibraryPath); if (Args.hasFlag(options::OPT_fopenmp_implicit_rpath, options::OPT_fno_openmp_implicit_rpath, true)) { @@ -1365,46 +1378,33 @@ void tools::addOpenMPRuntimeSpecificRPath(const ToolChain &TC, SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(TC.getDriver().Dir); llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME); - if (TC.getSanitizerArgs(Args).needsAsanRt()) { - CmdArgs.push_back("-rpath"); - CmdArgs.push_back(Args.MakeArgString(TC.getCompilerRTPath())); - } + if (TC.getSanitizerArgs(Args).needsAsanRt()) + addRPathCmdArg(Args, CmdArgs, TC.getCompilerRTPath(), + /*onlyIfPathExists=*/false); // In case LibSuffix was not built, try lib std::string CandidateRPath_suf = D.Dir + "/../" + LibSuffix; - CmdArgs.push_back("-rpath"); - CmdArgs.push_back(Args.MakeArgString(CandidateRPath_suf.c_str())); - // Add lib directory in case LibSuffix does not exist std::string CandidateRPath_lib = D.Dir + "/../lib"; - if ((!llvm::sys::fs::exists(CandidateRPath_suf)) && - (llvm::sys::fs::exists(CandidateRPath_lib))) { - CmdArgs.push_back("-rpath"); - CmdArgs.push_back(Args.MakeArgString(CandidateRPath_lib.c_str())); - } + if (!addRPathCmdArg(Args, CmdArgs, CandidateRPath_suf, + /*onlyIfPathExists=*/false)) + addRPathCmdArg(Args, CmdArgs, CandidateRPath_lib); std::string rocmPath = Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ).str(); if (rocmPath.size() != 0) { std::string rocmPath_lib = rocmPath + "/lib"; std::string rocmPath_suf = rocmPath + "/" + LibSuffix; - if (llvm::sys::fs::exists(rocmPath_suf)) { - CmdArgs.push_back("-rpath"); - CmdArgs.push_back(Args.MakeArgString(rocmPath_suf.c_str())); - } else if (llvm::sys::fs::exists(rocmPath_lib)) { - CmdArgs.push_back("-rpath"); - CmdArgs.push_back(Args.MakeArgString(rocmPath_lib.c_str())); - } + if (!addRPathCmdArg(Args, CmdArgs, rocmPath_suf)) + addRPathCmdArg(Args, CmdArgs, rocmPath_lib); } // Add Default lib path to ensure llvm dynamic library is picked up for // lib-debug/lib-perf - if (LibSuffix != "lib" && llvm::sys::fs::exists(DefaultLibPath)){ - CmdArgs.push_back("-rpath"); - CmdArgs.push_back(Args.MakeArgString(DefaultLibPath.c_str())); - } + if (LibSuffix != "lib") + addRPathCmdArg(Args, CmdArgs, DefaultLibPath.c_str()); - if (llvm::find_if(CmdArgs, [](StringRef str) { + if (llvm::find_if(CmdArgs, [](StringRef str) { return !str.compare("--enable-new-dtags"); }) == CmdArgs.end()) CmdArgs.push_back("--disable-new-dtags"); @@ -1444,10 +1444,8 @@ void tools::addArchSpecificRPath(const ToolChain &TC, const ArgList &Args, CandidateRPaths.emplace_back(*CandidateRPath); for (const auto &CandidateRPath : CandidateRPaths) { - if (TC.getVFS().exists(CandidateRPath)) { - CmdArgs.push_back("-rpath"); - CmdArgs.push_back(Args.MakeArgString(CandidateRPath)); - } + if (TC.getVFS().exists(CandidateRPath)) + addRPathCmdArg(Args, CmdArgs, CandidateRPath, /*onlyIfPathExists=*/false); } } diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c index 583e383196499..12f283707308e 100644 --- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c 
@@ -28,7 +28,7 @@ // AMDGCNSPIRV-LABEL: define spir_func void @foo( // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { // AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load volatile i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 // AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false // AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] // AMDGCNSPIRV: [[IF_THEN]]: diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c index f3c24e66894c1..76dead8ebbe89 100644 --- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c +++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c @@ -26,7 +26,7 @@ // AMDGCNSPIRV-LABEL: define spir_func void @foo( // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { // AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load volatile i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1 // AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false // AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] // AMDGCNSPIRV: [[IF_THEN]]: diff --git a/clang/test/Driver/clang-offload-bundler-zlib.c b/clang/test/Driver/clang-offload-bundler-zlib.c index b026e2ec99877..211601c2c7fbb 100644 --- a/clang/test/Driver/clang-offload-bundler-zlib.c +++ b/clang/test/Driver/clang-offload-bundler-zlib.c @@ -66,6 +66,30 @@ // NOHOST-V3-DAG: hip-amdgcn-amd-amdhsa--gfx900 // NOHOST-V3-DAG: hip-amdgcn-amd-amdhsa--gfx906 +// Check compression/decompression of offload bundle using version 2 format. 
+// +// RUN: env OFFLOAD_BUNDLER_COMPRESS=1 OFFLOAD_BUNDLER_VERBOSE=1 COMPRESSED_BUNDLE_FORMAT_VERSION=2 \ +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -input=%t.tgt1 -input=%t.tgt2 -output=%t.hip.bundle.bc 2>&1 | \ +// RUN: FileCheck -check-prefix=COMPRESS-V2 %s +// RUN: clang-offload-bundler -type=bc -list -input=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST-V2 %s +// RUN: env OFFLOAD_BUNDLER_VERBOSE=1 \ +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.hip.bundle.bc -unbundle 2>&1 | \ +// RUN: FileCheck -check-prefix=DECOMPRESS-V2 %s +// RUN: diff %t.tgt1 %t.res.tgt1 +// RUN: diff %t.tgt2 %t.res.tgt2 +// +// COMPRESS-V2: Compressed bundle format version: 2 +// COMPRESS-V2: Compression method used: zlib +// COMPRESS-V2: Compression level: 6 +// DECOMPRESS-V2: Compressed bundle format version: 2 +// DECOMPRESS-V2: Decompression method: zlib +// DECOMPRESS-V2: Hashes match: Yes +// NOHOST-V2-NOT: host- +// NOHOST-V2-DAG: hip-amdgcn-amd-amdhsa--gfx900 +// NOHOST-V2-DAG: hip-amdgcn-amd-amdhsa--gfx906 + // Check -compression-level= option // RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ diff --git a/clang/test/Driver/clang-offload-bundler-zstd.c b/clang/test/Driver/clang-offload-bundler-zstd.c index 667d9554daec7..c1123ae5acb38 100644 --- a/clang/test/Driver/clang-offload-bundler-zstd.c +++ b/clang/test/Driver/clang-offload-bundler-zstd.c @@ -29,11 +29,11 @@ // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // -// CHECK: Compressed bundle format version: 2 +// CHECK: Compressed bundle format version: 3 // CHECK: Total file size (including headers): [[SIZE:[0-9]*]] bytes // CHECK: Compression method used: zstd // CHECK: Compression level: 3 -// CHECK: Compressed bundle format version: 2 +// CHECK: Compressed bundle format version: 3 // CHECK: Total file size (from header): [[SIZE]] bytes // CHECK: Decompression method: zstd // CHECK: Hashes match: Yes diff --git a/clang/test/Driver/openmp-runtimelib.c b/clang/test/Driver/openmp-runtimelib.c index ad4bff0a70d9f..09600f2c376e3 100644 --- a/clang/test/Driver/openmp-runtimelib.c +++ b/clang/test/Driver/openmp-runtimelib.c @@ -1,38 +1,48 @@ // REQUIRES: amdgpu-registered-target -// Asan-Debug: /lib-debug/asan -// Asan-Devel: /lib/asan -// Asan-Perf: /lib-perf/asan - // RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-runtimelib=lib-debug %s -O3 2>&1 \ -// RUN: | FileCheck -check-prefixes=Debug %s +// RUN: | FileCheck -check-prefixes=Debug,Debug-Rel %s // RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-runtimelib=lib-perf %s -O3 2>&1 \ -// RUN: | FileCheck -check-prefixes=Perf %s +// RUN: | FileCheck -check-prefixes=Perf,Perf-Rel %s // RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-runtimelib=lib %s -O3 2>&1 \ -// RUN: | FileCheck -check-prefixes=Devel %s +// RUN: | FileCheck -check-prefixes=Devel,Devel-Rel %s // RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-target-fast %s -O3 2>&1 \ -// RUN: | FileCheck -check-prefixes=Default %s +// RUN: | FileCheck -check-prefixes=Devel,Devel-Rel %s // RUN: not %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a -fopenmp-runtimelib=oopsy %s -O3 2>&1 \ // RUN: | FileCheck -check-prefixes=Error %s // RUN: %clang -### 
-fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a:xnack+ -fopenmp-runtimelib=lib-debug -fsanitize=address -shared-libasan %s -O3 2>&1 \ -// RUN: | FileCheck -check-prefix=Asan-Debug %s +// RUN: | FileCheck -check-prefixes=Asan-Debug,Asan-Debug-Rel %s // RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a:xnack+ -fopenmp-runtimelib=lib -fsanitize=address -shared-libasan %s -O3 2>&1 \ -// RUN: | FileCheck -check-prefix=Asan-Devel %s +// RUN: | FileCheck -check-prefixes=Asan-Devel,Asan-Devel-Rel %s // RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a:xnack+ -fopenmp-runtimelib=lib-perf -fsanitize=address -shared-libasan %s -O3 2>&1 \ -// RUN: | FileCheck -check-prefix=Asan-Perf %s +// RUN: | FileCheck -check-prefixes=Asan-Perf,Asan-Perf-Rel %s // RUN: %clang -### -fopenmp -nogpuinc -nogpulib --offload-arch=gfx90a:xnack+ -fopenmp-target-fast -fsanitize=address -shared-libasan %s -O3 2>&1 \ -// RUN: | FileCheck -check-prefix=Asan-Devel %s +// RUN: | FileCheck -check-prefixes=Asan-Devel,Asan-Devel-Rel %s + +// Devel: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib]]" +// Devel-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]" + +// Debug: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib-debug]]" +// Debug-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]" + +// Perf: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib-perf]]" +// Perf-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]" + +// Asan-Devel: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib(/|\\\\)asan]]" +// Asan-Devel-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]" + +// Asan-Debug: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib-debug(/|\\\\)asan]]" +// Asan-Debug-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]" + +// Asan-Perf: "-rpath" "{{[^"]*}}[[LIB:(/|\\\\)lib-perf(/|\\\\)asan]]" +// Asan-Perf-Rel-NOT: "-rpath" "{{[^"]*(/|\\\\)\.\.}}[[LIB]]" -// Debug: /lib-debug -// Perf: /lib-perf -// Devel: /../lib -// Default: /../lib // Error: clang: error: unsupported argument 'oopsy' to option '-fopenmp-runtimelib=' diff --git a/clang/tools/amdllvm/CMakeLists.txt b/clang/tools/amdllvm/CMakeLists.txt index 964aeadfddb0c..346f7b70d3a1a 100644 --- a/clang/tools/amdllvm/CMakeLists.txt +++ b/clang/tools/amdllvm/CMakeLists.txt @@ -11,7 +11,7 @@ option(CLANG_LINK_FLANG "Create flang install link to clang" ON) list(APPEND CLANG_LINKS_TO_CREATE clang clang++ clang-cl clang-cpp clang-${CLANG_VERSION_MAJOR} lld) if(CLANG_LINK_FLANG) - list(APPEND CLANG_LINKS_TO_CREATE flang) + list(APPEND CLANG_LINKS_TO_CREATE flang flang-new) endif() foreach(link ${CLANG_LINKS_TO_CREATE}) diff --git a/compiler-rt/lib/asan/asan_allocator.cpp b/compiler-rt/lib/asan/asan_allocator.cpp index 8d7b3abe75c02..82409509a96b1 100644 --- a/compiler-rt/lib/asan/asan_allocator.cpp +++ b/compiler-rt/lib/asan/asan_allocator.cpp @@ -1304,6 +1304,9 @@ DECLARE_REAL(hsa_status_t, hsa_amd_ipc_memory_attach, const hsa_amd_ipc_memory_t *handle, size_t len, uint32_t num_agents, const hsa_agent_t *mapping_agents, void **mapped_ptr) DECLARE_REAL(hsa_status_t, hsa_amd_ipc_memory_detach, void *mapped_ptr) +DECLARE_REAL(hsa_status_t, hsa_amd_vmem_address_reserve_align, void** ptr, + size_t size, uint64_t address, uint64_t alignment, uint64_t flags) +DECLARE_REAL(hsa_status_t, hsa_amd_vmem_address_free, void* ptr, size_t size); namespace __asan { @@ -1332,9 +1335,8 @@ hsa_status_t asan_hsa_amd_memory_pool_free( if (p) { instance.Deallocate(ptr, 0, 0, stack, FROM_MALLOC); return HSA_STATUS_SUCCESS; - } else { - return REAL(hsa_amd_memory_pool_free)(ptr); } + return REAL(hsa_amd_memory_pool_free)(ptr); } hsa_status_t 
asan_hsa_amd_agents_allow_access( uint32_t num_agents, const hsa_agent_t *agents, const uint32_t *flags, const void *ptr, BufferedStackTrace *stack) { void *p = get_allocator().GetBlockBegin(ptr); - if (p) { - return REAL(hsa_amd_agents_allow_access)(num_agents, agents, flags, p); - } else { - return REAL(hsa_amd_agents_allow_access)(num_agents, agents, flags, ptr); - } + return REAL(hsa_amd_agents_allow_access)(num_agents, agents, flags, + p ? p : ptr); } // For asan allocator, kMetadataSize is 0 and maximum redzone size is 2048. This @@ -1394,5 +1393,59 @@ hsa_status_t asan_hsa_amd_ipc_memory_detach(void *mapped_ptr) { reinterpret_cast<void *>(reinterpret_cast<uptr>(mapped_ptr) - kPageSize_); return REAL(hsa_amd_ipc_memory_detach)(mapped_ptr_); } + +hsa_status_t asan_hsa_amd_vmem_address_reserve_align( + void** ptr, size_t size, uint64_t address, uint64_t alignment, + uint64_t flags, BufferedStackTrace* stack) { + // Bypass the tracking for a fixed address since it cannot be supported. + // Reasons: + // 1. Address may not meet the alignment/page-size requirement. + // 2. Requested range overlaps an existing reserved/mapped range. + // 3. Insufficient VA space to honor that exact placement. + if (address) + return REAL(hsa_amd_vmem_address_reserve_align)(ptr, size, address, + alignment, flags); + + if (alignment < kPageSize_) + alignment = kPageSize_; + + if (UNLIKELY(!IsPowerOfTwo(alignment))) { + errno = errno_EINVAL; + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + AmdgpuAllocationInfo aa_info; + aa_info.alloc_func = + reinterpret_cast<void *>(asan_hsa_amd_vmem_address_reserve_align); + aa_info.memory_pool = {0}; + aa_info.size = size; + aa_info.flags64 = flags; + aa_info.address = 0; + aa_info.alignment = alignment; + aa_info.ptr = nullptr; + SetErrnoOnNull(*ptr = instance.Allocate(size, alignment, stack, FROM_MALLOC, + false, &aa_info)); + + return aa_info.status; +} + +hsa_status_t asan_hsa_amd_vmem_address_free(void* ptr, size_t size, + BufferedStackTrace* stack) { + if (UNLIKELY(!IsAligned(reinterpret_cast<uptr>(ptr), kPageSize_))) { + errno = errno_EINVAL; + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + if (size == 0) { + errno = errno_EINVAL; + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + void* p = get_allocator().GetBlockBegin(ptr); + if (p) { + instance.Deallocate(ptr, 0, 0, stack, FROM_MALLOC); + return HSA_STATUS_SUCCESS; + } + return REAL(hsa_amd_vmem_address_free)(ptr, size); +} } // namespace __asan #endif diff --git a/compiler-rt/lib/asan/asan_allocator.h b/compiler-rt/lib/asan/asan_allocator.h index 3f7914140e24d..6132fbb64c75d 100644 --- a/compiler-rt/lib/asan/asan_allocator.h +++ b/compiler-rt/lib/asan/asan_allocator.h @@ -320,6 +320,13 @@ hsa_status_t asan_hsa_amd_ipc_memory_attach( const hsa_agent_t* mapping_agents, void** mapped_ptr); hsa_status_t asan_hsa_amd_ipc_memory_detach( void* mapped_ptr); +hsa_status_t asan_hsa_amd_vmem_address_reserve_align(void** ptr, size_t size, + uint64_t address, + uint64_t alignment, + uint64_t flags, + BufferedStackTrace* stack); +hsa_status_t asan_hsa_amd_vmem_address_free(void* ptr, size_t size, + BufferedStackTrace* stack); } // namespace __asan #endif diff --git a/compiler-rt/lib/asan/asan_errors.cpp b/compiler-rt/lib/asan/asan_errors.cpp index 77516182bb8dd..5f4b839cf2412 100644 --- a/compiler-rt/lib/asan/asan_errors.cpp +++ b/compiler-rt/lib/asan/asan_errors.cpp @@ -691,11 +691,11 @@ void ErrorNonSelfGeneric::Print() { Decorator d; Printf("%s", d.Error()); Report("ERROR: AddressSanitizer: %s on address %p at pc %p\n", bug_descr, - (void
*)addresses[0], callstack[0]); + (void *)addresses[0], (void *)callstack[0]); Printf("%s%s of size %zu at %p thread id %zu\n", d.Access(), access_size ? (is_write ? "WRITE" : "READ") : "ACCESS", access_size, - (void *)addresses[0], thread_id[0]); + (void *)addresses[0], (usize)thread_id[0]); // todo: perform symbolization for the given callstack // can be done by creating in-memory object file or by writing @@ -733,7 +733,7 @@ ErrorNonSelfAMDGPU::ErrorNonSelfAMDGPU(uptr *dev_callstack, u32 n_callstack, void ErrorNonSelfAMDGPU::PrintStack() { InternalScopedString source_location; - source_location.AppendF(" #0 %p", callstack[0]); + source_location.AppendF(" #0 %p", (void *)callstack[0]); #if SANITIZER_AMDGPU source_location.Append(" in "); __sanitizer::AMDGPUCodeObjectSymbolizer symbolizer; @@ -754,7 +754,8 @@ void ErrorNonSelfAMDGPU::PrintThreadsAndAddresses() { str.Append("\n"); per_row_count = 0; } - str.AppendF("%02d : %p ", workitem_ids[idx], device_address[idx]); + str.AppendF("%02d : %p ", (int)workitem_ids[idx], + (void *)device_address[idx]); per_row_count++; } str.Append("\n"); @@ -797,11 +798,12 @@ void ErrorNonSelfAMDGPU::PrintMallocStack() { uptr plo = ScanForMagicDown(start, lo, magic, lo); if (plo) { callstack[0] = ((uptr*)plo)[2]; - Printf("%s%p is %u bytes above an address from a %sdevice malloc " - "(or free) call of size %u from%s\n", - d.Location(), device_address[0], - (int)(device_address[0] - (plo+offset)), - d.Allocation(), ((int*)plo)[7], d.Default()); + Printf( + "%s%p is %u bytes above an address from a %sdevice malloc " + "(or free) call of size %u from%s\n", + d.Location(), (void *)device_address[0], + (u32)(device_address[0] - (plo + offset)), d.Allocation(), + ((u32*)plo)[7], d.Default()); // TODO: The code object with the malloc call may not be the same // code object trying the illegal access. A mechanism is needed // to obtain the former. @@ -811,12 +813,13 @@ void ErrorNonSelfAMDGPU::PrintMallocStack() { uptr phi = ScanForMagicUp(start, hi, magic, lo); if (phi) { callstack[0] = ((uptr*)phi)[2]; - Printf("%s%p is %u bytes below an address from a %sdevice malloc " - "(or free) call of size %u from%s\n", - d.Location(), device_address[0], - (int)((phi+offset) - device_address[0]), + Printf( + "%s%p is %u bytes below an address from a %sdevice malloc " + "(or free) call of size %u from%s\n", + d.Location(), (void *)device_address[0], + (u32)((phi + offset) - device_address[0]), - d.Allocation(), ((int*)phi)[7], d.Default()); + d.Allocation(), ((u32*)phi)[7], d.Default()); PrintStack(); } } @@ -825,10 +828,11 @@ void ErrorNonSelfAMDGPU::PrintMallocStack() { void ErrorNonSelfAMDGPU::Print() { Decorator d; Printf("%s", d.Error()); - Report("ERROR: AddressSanitizer: %s on amdgpu device %zu at pc %p\n", - bug_descr, device_id, callstack[0]); - Printf("%s%s of size %zu in workgroup id (%zu,%zu,%zu)\n", d.Access(), - (is_write ? "WRITE" : "READ"), access_size, wg.idx, wg.idy, wg.idz); + Report("ERROR: AddressSanitizer: %s on amdgpu device %d at pc %p\n", + bug_descr, device_id, (void *)callstack[0]); + Printf("%s%s of size %zu in workgroup id (%llu,%llu,%llu)\n", d.Access(), + (is_write ? 
"WRITE" : "READ"), access_size, wg.idx, + wg.idy, wg.idz); Printf("%s", d.Default()); PrintStack(); Printf("%s", d.Location()); diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index 6e7239eea3146..978854c70be8d 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -897,6 +897,22 @@ INTERCEPTOR(hsa_status_t, hsa_amd_ipc_memory_detach, void* mapped_ptr) { return asan_hsa_amd_ipc_memory_detach(mapped_ptr); } +INTERCEPTOR(hsa_status_t, hsa_amd_vmem_address_reserve_align, void** ptr, + size_t size, uint64_t address, uint64_t alignment, uint64_t flags) { + AsanInitFromRtl(); + ENSURE_HSA_INITED(); + GET_STACK_TRACE_MALLOC; + return asan_hsa_amd_vmem_address_reserve_align(ptr, size, address, alignment, + flags, &stack); +} + +INTERCEPTOR(hsa_status_t, hsa_amd_vmem_address_free, void* ptr, size_t size) { + AsanInitFromRtl(); + ENSURE_HSA_INITED(); + GET_STACK_TRACE_FREE; + return asan_hsa_amd_vmem_address_free(ptr, size, &stack); +} + void InitializeAmdgpuInterceptors() { ASAN_INTERCEPT_FUNC(hsa_memory_copy); ASAN_INTERCEPT_FUNC(hsa_amd_memory_pool_allocate); @@ -909,6 +925,8 @@ void InitializeAmdgpuInterceptors() { ASAN_INTERCEPT_FUNC(hsa_amd_ipc_memory_create); ASAN_INTERCEPT_FUNC(hsa_amd_ipc_memory_attach); ASAN_INTERCEPT_FUNC(hsa_amd_ipc_memory_detach); + ASAN_INTERCEPT_FUNC(hsa_amd_vmem_address_reserve_align); + ASAN_INTERCEPT_FUNC(hsa_amd_vmem_address_free); } void ENSURE_HSA_INITED() { diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index 886e93e5fa813..d09a9a70fd83b 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -792,7 +792,7 @@ static void PrintNoOriginTrackingWarning() { static void PrintNoTaintWarning(const void *address) { Decorator d; - Printf(" %sDFSan: no tainted value at %x%s\n", d.Warning(), address, + Printf(" %sDFSan: no tainted value at %zx%s\n", d.Warning(), (uptr)address, d.Default()); } diff --git a/compiler-rt/lib/hwasan/hwasan.cpp b/compiler-rt/lib/hwasan/hwasan.cpp index 24384d8b4d2cf..615bae4b3a3fc 100644 --- a/compiler-rt/lib/hwasan/hwasan.cpp +++ b/compiler-rt/lib/hwasan/hwasan.cpp @@ -176,7 +176,7 @@ static void HwasanFormatMemoryUsage(InternalScopedString &s) { "HWASAN pid: %d rss: %zd threads: %zd stacks: %zd" " thr_aux: %zd stack_depot: %zd uniq_stacks: %zd" " heap: %zd", - internal_getpid(), GetRSS(), thread_stats.n_live_threads, + (int)internal_getpid(), GetRSS(), thread_stats.n_live_threads, thread_stats.total_stack_size, thread_stats.n_live_threads * thread_list.MemoryUsedPerThread(), sds.allocated, sds.n_uniq_ids, asc[AllocatorStatMapped]); @@ -692,7 +692,7 @@ void __hwasan_handle_longjmp(const void *sp_dst) { "WARNING: HWASan is ignoring requested __hwasan_handle_longjmp: " "stack top: %p; target %p; distance: %p (%zd)\n" "False positive error reports may follow\n", - (void *)sp, (void *)dst, dst - sp, dst - sp); + (void *)sp, (void *)dst, (void *)(dst - sp), dst - sp); return; } TagMemory(sp, dst - sp, 0); diff --git a/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h b/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h index 7d134e8c4b7fa..52a28438f3a9b 100644 --- a/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h +++ b/compiler-rt/lib/hwasan/hwasan_malloc_bisect.h @@ -41,7 +41,7 @@ static inline bool malloc_bisect(StackTrace *stack, uptr orig_size) { if (h < left || h > right) return false; if (flags()->malloc_bisect_dump) { - Printf("[alloc] %u %zu\n", h, orig_size); + Printf("[alloc] %u 
%zu\n", (u32)h, orig_size); stack->Print(); } return true; diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index bc66e6e805c91..6eafcf9163afa 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -306,8 +306,9 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, "%p is located %zd bytes %s a %zd-byte local variable %s " "[%p,%p) " "in %s %s\n", - untagged_addr, offset, whence, local.size, local.name, best_beg, - best_beg + local.size, local.function_name, location.data()); + (void *)untagged_addr, offset, whence, local.size, local.name, + (void *)best_beg, (void *)(best_beg + local.size), + local.function_name, location.data()); location.clear(); Printf("%s\n", d.Default()); } @@ -738,8 +739,8 @@ void BaseReport::PrintHeapOrGlobalCandidate() const { Printf("%s", d.Location()); Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n", untagged_addr, offset, whence, - candidate.heap.end - candidate.heap.begin, candidate.heap.begin, - candidate.heap.end); + candidate.heap.end - candidate.heap.begin, + (void *)candidate.heap.begin, (void *)candidate.heap.end); Printf("%s", d.Allocation()); Printf("allocated by thread T%u here:\n", candidate.heap.thread_id); Printf("%s", d.Default()); @@ -762,11 +763,11 @@ void BaseReport::PrintHeapOrGlobalCandidate() const { Printf( "%p is located %zd bytes %s a %zd-byte global variable " "%s [%p,%p) in %s\n", - untagged_addr, + (void *)untagged_addr, candidate.after ? untagged_addr - (info.start + info.size) : info.start - untagged_addr, candidate.after ? "after" : "before", info.size, info.name, - info.start, info.start + info.size, module_name); + (void *)info.start, (void *)(info.start + info.size), module_name); } else { uptr size = GetGlobalSizeFromDescriptor(candidate.untagged_addr); if (size == 0) @@ -774,14 +775,14 @@ void BaseReport::PrintHeapOrGlobalCandidate() const { Printf( "%p is located %s a global variable in " "\n #0 0x%x (%s+0x%x)\n", - untagged_addr, candidate.after ? "after" : "before", - candidate.untagged_addr, module_name, module_address); + (void *)untagged_addr, candidate.after ? "after" : "before", + (void *)candidate.untagged_addr, module_name, (u32)module_address); else Printf( "%p is located %s a %zd-byte global variable in " "\n #0 0x%x (%s+0x%x)\n", - untagged_addr, candidate.after ? "after" : "before", size, - candidate.untagged_addr, module_name, module_address); + (void *)untagged_addr, candidate.after ? "after" : "before", size, + (void *)candidate.untagged_addr, module_name, (u32)module_address); } Printf("%s", d.Default()); } @@ -792,8 +793,8 @@ void BaseReport::PrintAddressDescription() const { int num_descriptions_printed = 0; if (MemIsShadow(untagged_addr)) { - Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(), untagged_addr, - d.Default()); + Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(), + (void *)untagged_addr, d.Default()); return; } @@ -802,7 +803,7 @@ void BaseReport::PrintAddressDescription() const { Printf( "%s[%p,%p) is a %s %s heap chunk; " "size: %zd offset: %zd\n%s", - d.Location(), heap.begin, heap.begin + heap.size, + d.Location(), (void *)heap.begin, (void *)(heap.begin + heap.size), heap.from_small_heap ? "small" : "large", heap.is_allocated ? 
"allocated" : "unallocated", heap.size, untagged_addr - heap.begin, d.Default()); @@ -821,8 +822,8 @@ void BaseReport::PrintAddressDescription() const { Printf("%s", d.Error()); Printf("\nCause: stack tag-mismatch\n"); Printf("%s", d.Location()); - Printf("Address %p is located in stack of thread T%zd\n", untagged_addr, - sa.thread_id()); + Printf("Address %p is located in stack of thread T%zd\n", + (void *)untagged_addr, (ssize)sa.thread_id()); Printf("%s", d.Default()); announce_by_id(sa.thread_id()); PrintStackAllocations(sa.get(), ptr_tag, untagged_addr); @@ -842,9 +843,9 @@ void BaseReport::PrintAddressDescription() const { Printf("\nCause: use-after-free\n"); Printf("%s", d.Location()); Printf("%p is located %zd bytes inside a %zd-byte region [%p,%p)\n", - untagged_addr, untagged_addr - UntagAddr(har.tagged_addr), - har.requested_size, UntagAddr(har.tagged_addr), - UntagAddr(har.tagged_addr) + har.requested_size); + (void *)untagged_addr, untagged_addr - UntagAddr(har.tagged_addr), + (ssize)har.requested_size, UntagAddr(har.tagged_addr), + (void *)(UntagAddr(har.tagged_addr) + har.requested_size)); Printf("%s", d.Allocation()); Printf("freed by thread T%u here:\n", ha.free_thread_id); Printf("%s", d.Default()); @@ -858,7 +859,7 @@ void BaseReport::PrintAddressDescription() const { // Print a developer note: the index of this heap object // in the thread's deallocation ring buffer. Printf("hwasan_dev_note_heap_rb_distance: %zd %zd\n", ha.ring_index + 1, - flags()->heap_history_size); + (ssize)flags()->heap_history_size); Printf("hwasan_dev_note_num_matching_addrs: %zd\n", ha.num_matching_addrs); Printf("hwasan_dev_note_num_matching_addrs_4b: %zd\n", ha.num_matching_addrs_4b); @@ -915,10 +916,11 @@ InvalidFreeReport::~InvalidFreeReport() { const Thread *thread = GetCurrentThread(); if (thread) { Report("ERROR: %s: %s on address %p at pc %p on thread T%zd\n", - SanitizerToolName, bug_type, untagged_addr, pc, thread->unique_id()); + SanitizerToolName, bug_type, (void *)untagged_addr, (void *)pc, + (ssize)thread->unique_id()); } else { Report("ERROR: %s: %s on address %p at pc %p on unknown thread\n", - SanitizerToolName, bug_type, untagged_addr, pc); + SanitizerToolName, bug_type, (void *)untagged_addr, (void *)pc); } Printf("%s", d.Access()); if (shadow.addr) { @@ -967,7 +969,8 @@ TailOverwrittenReport::~TailOverwrittenReport() { Printf("%s", d.Error()); const char *bug_type = "allocation-tail-overwritten"; Report("ERROR: %s: %s; heap object [%p,%p) of size %zd\n", SanitizerToolName, - bug_type, untagged_addr, untagged_addr + orig_size, orig_size); + bug_type, (void *)untagged_addr, (void *)(untagged_addr + orig_size), + orig_size); Printf("\n%s", d.Default()); Printf( "Stack of invalid access unknown. Issue detected at deallocation " @@ -1037,7 +1040,7 @@ TagMismatchReport::~TagMismatchReport() { uptr pc = GetTopPc(stack); Printf("%s", d.Error()); Report("ERROR: %s: %s on address %p at pc %p\n", SanitizerToolName, bug_type, - untagged_addr, pc); + (void *)untagged_addr, (void *)pc); Thread *t = GetCurrentThread(); @@ -1049,12 +1052,12 @@ TagMismatchReport::~TagMismatchReport() { GetShortTagCopy(MemToShadow(untagged_addr + mismatch_offset)); Printf( "%s of size %zu at %p tags: %02x/%02x(%02x) (ptr/mem) in thread T%zd\n", - is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag, - mem_tag, short_tag, t->unique_id()); + is_store ? 
"WRITE" : "READ", access_size, (void *)untagged_addr, + ptr_tag, mem_tag, short_tag, (ssize)t->unique_id()); } else { Printf("%s of size %zu at %p tags: %02x/%02x (ptr/mem) in thread T%zd\n", - is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag, - mem_tag, t->unique_id()); + is_store ? "WRITE" : "READ", access_size, (void *)untagged_addr, + ptr_tag, mem_tag, (ssize)t->unique_id()); } if (mismatch_offset) Printf("Invalid access starting at offset %zu\n", mismatch_offset); @@ -1093,7 +1096,7 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size, // See the frame breakdown defined in __hwasan_tag_mismatch (from // hwasan_tag_mismatch_{aarch64,riscv64}.S). void ReportRegisters(const uptr *frame, uptr pc) { - Printf("\nRegisters where the failure occurred (pc %p):\n", pc); + Printf("\nRegisters where the failure occurred (pc %p):\n", (void *)pc); // We explicitly print a single line (4 registers/line) each iteration to // reduce the amount of logcat error messages printed. Each Printf() will diff --git a/compiler-rt/lib/hwasan/hwasan_thread.cpp b/compiler-rt/lib/hwasan/hwasan_thread.cpp index 8b32e4e760e2f..7e59ee8fc076d 100644 --- a/compiler-rt/lib/hwasan/hwasan_thread.cpp +++ b/compiler-rt/lib/hwasan/hwasan_thread.cpp @@ -120,9 +120,10 @@ void Thread::Destroy() { } void Thread::Print(const char *Prefix) { - Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix, unique_id_, - (void *)this, stack_bottom(), stack_top(), - stack_top() - stack_bottom(), tls_begin(), tls_end()); + Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix, + (ssize)unique_id_, (void *)this, (void *)stack_bottom(), + (void *)stack_top(), stack_top() - stack_bottom(), (void *)tls_begin(), + (void *)tls_end()); } static u32 xorshift(u32 state) { diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index b17a17e1193bc..6d8a7a06d8458 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -806,7 +806,7 @@ static bool ReportUnsuspendedThreads( succeded = false; Report( "Running thread %zu was not suspended. 
False leaks are possible.\n", - os_id); + (usize)os_id); } } return succeded; } diff --git a/compiler-rt/lib/memprof/memprof_shadow_setup.cpp b/compiler-rt/lib/memprof/memprof_shadow_setup.cpp index e7832f656ee8e..7712a94fde3d6 100644 --- a/compiler-rt/lib/memprof/memprof_shadow_setup.cpp +++ b/compiler-rt/lib/memprof/memprof_shadow_setup.cpp @@ -29,7 +29,7 @@ static void ProtectGap(uptr addr, uptr size) { Printf("protect_shadow_gap=0:" " not protecting shadow gap, allocating gap's shadow\n" "|| `[%p, %p]` || ShadowGap's shadow ||\n", - GapShadowBeg, GapShadowEnd); + (void *)GapShadowBeg, (void *)GapShadowEnd); ReserveShadowMemoryRange(GapShadowBeg, GapShadowEnd, "unprotected gap shadow"); return; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.cpp index 5fb47c9f9a0b0..cf10cb773e746 100755 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.cpp @@ -22,6 +22,11 @@ struct HsaMemoryFunctions { void *(*alloc)(size_t), uint32_t *num_agents_accessible, hsa_agent_t **accessible); + hsa_status_t (*vmem_address_reserve_align)(void** ptr, size_t size, + uint64_t address, + uint64_t alignment, + uint64_t flags); + hsa_status_t (*vmem_address_free)(void* ptr, size_t size); }; static HsaMemoryFunctions hsa_amd; @@ -37,20 +42,30 @@ bool AmdgpuMemFuncs::Init() { RTLD_NEXT, "hsa_amd_memory_pool_free"); hsa_amd.pointer_info = (decltype(hsa_amd.pointer_info))dlsym( RTLD_NEXT, "hsa_amd_pointer_info"); + hsa_amd.vmem_address_reserve_align = + (decltype(hsa_amd.vmem_address_reserve_align))dlsym( + RTLD_NEXT, "hsa_amd_vmem_address_reserve_align"); + hsa_amd.vmem_address_free = (decltype(hsa_amd.vmem_address_free))dlsym( + RTLD_NEXT, "hsa_amd_vmem_address_free"); if (!hsa_amd.memory_pool_allocate || !hsa_amd.memory_pool_free || - !hsa_amd.pointer_info) + !hsa_amd.pointer_info || !hsa_amd.vmem_address_reserve_align || + !hsa_amd.vmem_address_free) return false; - else - return true; + return true; } void *AmdgpuMemFuncs::Allocate(uptr size, uptr alignment, DeviceAllocationInfo *da_info) { AmdgpuAllocationInfo *aa_info = reinterpret_cast<AmdgpuAllocationInfo *>(da_info); - - aa_info->status = hsa_amd.memory_pool_allocate(aa_info->memory_pool, size, - aa_info->flags, &aa_info->ptr); + if (!aa_info->memory_pool.handle) { + aa_info->status = hsa_amd.vmem_address_reserve_align( + &aa_info->ptr, size, aa_info->address, aa_info->alignment, + aa_info->flags64); + } else { + aa_info->status = hsa_amd.memory_pool_allocate( + aa_info->memory_pool, size, aa_info->flags, &aa_info->ptr); + } if (aa_info->status != HSA_STATUS_SUCCESS) return nullptr; @@ -58,10 +73,18 @@ void *AmdgpuMemFuncs::Allocate(uptr size, uptr alignment, } void AmdgpuMemFuncs::Deallocate(void *p) { - UNUSED hsa_status_t status = hsa_amd.memory_pool_free(p); + DevicePointerInfo DevPtrInfo; + if (AmdgpuMemFuncs::GetPointerInfo(reinterpret_cast<uptr>(p), &DevPtrInfo)) { + if (DevPtrInfo.type == HSA_EXT_POINTER_TYPE_HSA) { + UNUSED hsa_status_t status = hsa_amd.memory_pool_free(p); + } else if (DevPtrInfo.type == HSA_EXT_POINTER_TYPE_RESERVED_ADDR) { + UNUSED hsa_status_t status = + hsa_amd.vmem_address_free(p, DevPtrInfo.map_size); + } + } } -bool AmdgpuMemFuncs::GetPointerInfo(uptr ptr, DevivePointerInfo *ptr_info) { +bool AmdgpuMemFuncs::GetPointerInfo(uptr ptr, DevicePointerInfo* ptr_info) { hsa_amd_pointer_info_t info; info.size = sizeof(hsa_amd_pointer_info_t); hsa_status_t status = @@ -70,8 +93,12 @@ bool
AmdgpuMemFuncs::GetPointerInfo(uptr ptr, DevivePointerInfo *ptr_info) { if (status != HSA_STATUS_SUCCESS) return false; - ptr_info->map_beg = reinterpret_cast<uptr>(info.agentBaseAddress); + if (info.type == HSA_EXT_POINTER_TYPE_RESERVED_ADDR) + ptr_info->map_beg = reinterpret_cast<uptr>(info.hostBaseAddress); + else if (info.type == HSA_EXT_POINTER_TYPE_HSA) + ptr_info->map_beg = reinterpret_cast<uptr>(info.agentBaseAddress); ptr_info->map_size = info.sizeInBytes; + ptr_info->type = static_cast<u64>(info.type); return true; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.h index 634731703aba3..84b62964e5145 100755 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_amdgpu.h @@ -20,7 +20,7 @@ class AmdgpuMemFuncs { static void *Allocate(uptr size, uptr alignment, DeviceAllocationInfo *da_info); static void Deallocate(void *p); - static bool GetPointerInfo(uptr ptr, DevivePointerInfo *ptr_info); + static bool GetPointerInfo(uptr ptr, DevicePointerInfo* ptr_info); static uptr GetPageSize(); }; @@ -32,8 +32,11 @@ struct AmdgpuAllocationInfo : public DeviceAllocationInfo { hsa_status_t status; void *alloc_func; hsa_amd_memory_pool_t memory_pool; - size_t size; - uint32_t flags; + u64 alignment; + u64 address; + u64 flags64; + usize size; + u32 flags; void *ptr; }; #endif // SANITIZER_AMDGPU diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_device.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_device.h index 9feb0549b33b3..f76800da79ac3 100755 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_device.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_device.h @@ -31,7 +31,8 @@ struct DeviceAllocationInfo { DeviceAllocationType type_; }; -struct DevivePointerInfo { +struct DevicePointerInfo { + u64 type; uptr map_beg; uptr map_size; }; @@ -165,7 +166,7 @@ class DeviceAllocatorT { : nullptr; } - void *GetBlockBegin(const void *ptr) const { + void* GetBlockBegin(const void* ptr) const { Header header; if (!mem_funcs_inited_) return nullptr; uptr p = reinterpret_cast<uptr>(ptr); @@ -182,7 +183,7 @@ class DeviceAllocatorT { if (!nearest_chunk) return nullptr; if (p != nearest_chunk) { - Header *h = GetHeader(nearest_chunk, &header); + Header* h = GetHeader(nearest_chunk, &header); CHECK_GE(nearest_chunk, h->map_beg); CHECK_LT(nearest_chunk, h->map_beg + h->map_size); CHECK_LE(nearest_chunk, p); @@ -297,7 +298,7 @@ class DeviceAllocatorT { return mem_funcs_inited_; } - typedef DevivePointerInfo Header; + typedef DevicePointerInfo Header; Header *GetHeaderAnyPointer(uptr p, Header* h) const { CHECK(IsAligned(p, page_size_)); diff --git a/compiler-rt/lib/xray/xray_init.cpp b/compiler-rt/lib/xray/xray_init.cpp index 020bfe52b5320..9cc6d5fcc4c1d 100644 --- a/compiler-rt/lib/xray/xray_init.cpp +++ b/compiler-rt/lib/xray/xray_init.cpp @@ -105,7 +105,7 @@ __xray_register_sleds(const XRaySledEntry *SledsBegin, } if (Verbosity()) - Report("Registering %d new functions!\n", SledMap.Functions); + Report("Registering %d new functions!\n", (int)SledMap.Functions); { SpinMutexLock Guard(&XRayInstrMapMutex); diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 3f97827874a70..9bf0c56c4521a 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -308,7 +308,8 @@ XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) {
return XRayPatchingStatus::NOT_INITIALIZED; if (Verbosity()) - Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries); + Report("Patching object %d with %d functions.\n", ObjId, + (int)InstrMap.Entries); // Check if the corresponding DSO has been unloaded. if (!InstrMap.Loaded) { diff --git a/flang/test/Driver/arch-specific-libdir-rpath.f95 b/flang/test/Driver/arch-specific-libdir-rpath.f95 index 23fb52abfbd57..15cb27e6926fd 100644 --- a/flang/test/Driver/arch-specific-libdir-rpath.f95 +++ b/flang/test/Driver/arch-specific-libdir-rpath.f95 @@ -32,8 +32,7 @@ ! ! ! RESDIR: "-resource-dir" "[[RESDIR:[^"]*]]" -! ! LIBPATH-X86_64: -L[[RESDIR]]{{(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}} -! RPATH-X86_64: "-rpath" "[[RESDIR]]{{(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}" ! -! NO-RPATH-X86_64-NOT: "-rpath" "[[RESDIR]]{{(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}" +! RPATH-X86_64: "-rpath" "{{[^"]*(/|\\\\)resource_dir_with_arch_subdir(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}" +! NO-RPATH-X86_64-NOT: "-rpath" "{{[^"]*(/|\\\\)resource_dir_with_arch_subdir(/|\\\\)lib(/|\\\\)linux(/|\\\\)x86_64}}" diff --git a/flang/test/Driver/flang-new-warning.f90 b/flang/test/Driver/flang-new-warning.f90 new file mode 100644 index 0000000000000..3d83c7ad3966c --- /dev/null +++ b/flang/test/Driver/flang-new-warning.f90 @@ -0,0 +1,3 @@ +! RUN: amdflang-new -c %s 2>&1 | FileCheck %s +! CHECK: warning: the 'amdflang-new' and 'flang-new' commands have been deprecated; please use 'amdflang' instead +! XFAIL: * diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 35d907fbe44d4..1c703f631ba0d 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -74,6 +74,12 @@ multiple file formats. For MachO objects, ``
<section>`` must be formatted as ``<segment>,<section>
``. +.. option:: --dump-offload-bundle=<URI> + + Dump the HIP offload bundle entry specified by the given URI into a + code object file. + + .. option:: --enable-deterministic-archives, -D Enable deterministic mode when copying archives, i.e. use 0 for archive member diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst index 8bd29eafbbfcf..faaddb4699f7d 100644 --- a/llvm/docs/CommandGuide/llvm-readobj.rst +++ b/llvm/docs/CommandGuide/llvm-readobj.rst @@ -104,6 +104,10 @@ file formats. Do not demangle symbol names in the output. This option is only for ELF and XCOFF file formats. The option is enabled by default. +.. option:: --offloading + + Display the list of HIP offload bundles using URI syntax. + .. option:: --relocations, --relocs, -r Display the relocation entries in the file. diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 962693003349e..2b7df71d78bda 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2986,7 +2986,7 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>: Intrinsic< [llvm_any_ty], [ptr_ty], - [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], + [IntrReadMem, IntrArgMemOnly, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand] >; diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h index faa7b0db757a3..6858a0dbf9298 100644 --- a/llvm/include/llvm/ObjCopy/CommonConfig.h +++ b/llvm/include/llvm/ObjCopy/CommonConfig.h @@ -277,6 +277,8 @@ struct CommonConfig { bool StripUnneeded = false; bool Weaken = false; bool DecompressDebugSections = false; + bool DumpOffloadBundle = false; + bool NeedPositional = true; DebugCompressionType CompressionType = DebugCompressionType::None; diff --git a/llvm/include/llvm/Object/OffloadBundle.h b/llvm/include/llvm/Object/OffloadBundle.h index f4d5a1d878b8d..7ced75073f8d2 100644 --- a/llvm/include/llvm/Object/OffloadBundle.h +++ b/llvm/include/llvm/Object/OffloadBundle.h @@ -32,29 +32,40 @@ namespace llvm { namespace object { +// CompressedOffloadBundle represents the format for the compressed offload +// bundles. +// +// The format is as follows: +// - Magic Number (4 bytes) - A constant "CCOB". +// - Version (2 bytes) +// - Compression Method (2 bytes) - Uses the values from +// llvm::compression::Format. +// - Total file size (4 bytes in V2, 8 bytes in V3). +// - Uncompressed Size (4 bytes in V1/V2, 8 bytes in V3). +// - Truncated MD5 Hash (8 bytes). +// - Compressed Data (variable length).
class CompressedOffloadBundle { private: - static inline const size_t MagicSize = 4; - static inline const size_t VersionFieldSize = sizeof(uint16_t); - static inline const size_t MethodFieldSize = sizeof(uint16_t); - static inline const size_t FileSizeFieldSize = sizeof(uint32_t); - static inline const size_t UncompressedSizeFieldSize = sizeof(uint32_t); - static inline const size_t HashFieldSize = sizeof(uint64_t); - static inline const size_t V1HeaderSize = - MagicSize + VersionFieldSize + MethodFieldSize + - UncompressedSizeFieldSize + HashFieldSize; - static inline const size_t V2HeaderSize = - MagicSize + VersionFieldSize + FileSizeFieldSize + MethodFieldSize + - UncompressedSizeFieldSize + HashFieldSize; static inline const llvm::StringRef MagicNumber = "CCOB"; - static inline const uint16_t Version = 2; public: - LLVM_ABI static llvm::Expected> + struct CompressedBundleHeader { + unsigned Version; + llvm::compression::Format CompressionFormat; + std::optional FileSize; + size_t UncompressedFileSize; + uint64_t Hash; + + static llvm::Expected tryParse(llvm::StringRef); + }; + + static inline const uint16_t DefaultVersion = 2; + + static llvm::Expected> compress(llvm::compression::Params P, const llvm::MemoryBuffer &Input, - bool Verbose = false); - LLVM_ABI static llvm::Expected> - decompress(llvm::MemoryBufferRef &Input, bool Verbose = false); + uint16_t Version, bool Verbose = false); + static llvm::Expected> + decompress(const llvm::MemoryBuffer &Input, bool Verbose = false); }; /// Bundle entry in binary clang-offload-bundler format. @@ -62,12 +73,15 @@ struct OffloadBundleEntry { uint64_t Offset = 0u; uint64_t Size = 0u; uint64_t IDLength = 0u; - StringRef ID; - OffloadBundleEntry(uint64_t O, uint64_t S, uint64_t I, StringRef T) - : Offset(O), Size(S), IDLength(I), ID(T) {} + std::string ID; + OffloadBundleEntry(uint64_t O, uint64_t S, uint64_t I, std::string T) + : Offset(O), Size(S), IDLength(I) { + ID.reserve(T.size()); + ID = T; + } void dumpInfo(raw_ostream &OS) { OS << "Offset = " << Offset << ", Size = " << Size - << ", ID Length = " << IDLength << ", ID = " << ID; + << ", ID Length = " << IDLength << ", ID = " << ID << "\n"; } void dumpURI(raw_ostream &OS, StringRef FilePath) { OS << ID.data() << "\tfile://" << FilePath << "#offset=" << Offset @@ -82,15 +96,20 @@ class OffloadBundleFatBin { StringRef FileName; uint64_t NumberOfEntries; SmallVector Entries; + bool Decompressed; public: + std::unique_ptr DecompressedBuffer; + SmallVector getEntries() { return Entries; } uint64_t getSize() const { return Size; } StringRef getFileName() const { return FileName; } uint64_t getNumEntries() const { return NumberOfEntries; } + bool isDecompressed() const { return Decompressed; } LLVM_ABI static Expected> - create(MemoryBufferRef, uint64_t SectionOffset, StringRef FileName); + create(MemoryBufferRef, uint64_t SectionOffset, StringRef FileName, + bool Decompress = false); LLVM_ABI Error extractBundle(const ObjectFile &Source); LLVM_ABI Error dumpEntryToCodeObject(); @@ -106,9 +125,15 @@ class OffloadBundleFatBin { Entry.dumpURI(outs(), FileName); } - OffloadBundleFatBin(MemoryBufferRef Source, StringRef File) - : FileName(File), NumberOfEntries(0), - Entries(SmallVector()) {} + OffloadBundleFatBin(MemoryBufferRef Source, StringRef File, + bool Decompress = false) + : FileName(File), Decompressed(Decompress), NumberOfEntries(0), + Entries(SmallVector()) { + if (Decompress) { + DecompressedBuffer = + MemoryBuffer::getMemBufferCopy(Source.getBuffer(), File); + } + } }; enum UriTypeT 
{ FILE_URI, MEMORY_URI }; @@ -161,7 +186,7 @@ struct OffloadBundleURI { OffsetStr.getAsInteger(10, O); Str = Str.drop_front(OffsetStr.size()); - if (Str.consume_front("&size=")) + if (!Str.consume_front("&size=")) return createStringError(object_error::parse_failed, "Reading 'size' in URI"); @@ -191,6 +216,10 @@ LLVM_ABI Error extractOffloadBundleFatBinary( LLVM_ABI Error extractCodeObject(const ObjectFile &Source, int64_t Offset, int64_t Size, StringRef OutputFileName); +/// Extract code object memory from the given \p Source object file at \p Offset +/// and of \p Size, and copy into \p OutputFileName. +LLVM_ABI Error extractCodeObject(MemoryBufferRef Buffer, int64_t Offset, + int64_t Size, StringRef OutputFileName); /// Extracts an Offload Bundle Entry given by URI LLVM_ABI Error extractOffloadBundleByURI(StringRef URIstr); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9bbb89e37865d..13a4865514ec8 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -3527,6 +3528,50 @@ class TypePromotionTransaction { LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy << "\n"); Inst->mutateType(NewTy); + // Handle debug Info + mutateDgbInfo(Inst, NewTy); + } + + void mutateDgbInfo(Instruction *I, Type *Ty) { + SmallVector DVIs; + SmallVector Dbgs; + findDbgUsers(DVIs, I, &Dbgs); + for (DbgVariableRecord *Dbg : Dbgs) { + DIExpression *Expr = Dbg->getExpression(); + if (!Expr) + continue; + std::optional Elems = + Expr->getNewElementsRef(); + if (!Elems.has_value()) + continue; + // Collect arg of Inst + uint32_t Idx = 0; + SmallBitVector Idxs(Dbg->getNumVariableLocationOps()); + for (auto *VMD : Dbg->location_ops()) { + if (VMD == I) { + Idxs.set(Idx); + } + Idx++; + } + // Replace types + DIExprBuilder Builder(Expr->getContext()); + unsigned long ArgI = 0; + for (auto [I, Op] : enumerate(*Elems)) { + const DIOp::Arg *AsArg = std::get_if(&Op); + const DIOp::Convert *CvtArg = std::get_if(&Op); + if (AsArg && Idxs[AsArg->getIndex()]) { + ArgI = I; + Builder.append(AsArg->getIndex(), Ty); + if (Ty != OrigTy) + Builder.append(OrigTy); + } else if (!(CvtArg && I == ArgI + 1 && + CvtArg->getResultType() == Ty)) { + Builder.append(Op); + } + I++; + } + Dbg->setExpression(Builder.intoExpression()); + } } /// Mutate the instruction back to its original type. 
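At the point where mutateType rewrites the instruction's type, the new mutateDgbInfo hook first locates the debug records that refer to the mutated instruction and then rebuilds their expressions. A rough sketch of just the lookup step, using the stock findDbgUsers API (the expression rewriting itself relies on the DIExprBuilder/DIOp machinery added in this patch):

    // Collect the debug-info users of I so their expressions can be rebuilt
    // against the new type. findDbgUsers fills intrinsic-based users and,
    // via the optional third argument, record-based (DbgVariableRecord) users.
    SmallVector<DbgVariableIntrinsic *> DbgUsers;
    SmallVector<DbgVariableRecord *> DbgRecords;
    findDbgUsers(DbgUsers, I, &DbgRecords);
    for (DbgVariableRecord *DVR : DbgRecords) {
      if (DIExpression *Expr = DVR->getExpression()) {
        // Rewrite Expr here so it stays consistent with I's new type.
      }
    }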
@@ -3534,6 +3579,8 @@ class TypePromotionTransaction { LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy << "\n"); Inst->mutateType(OrigTy); + // Handle debug Info + mutateDgbInfo(Inst, OrigTy); } }; diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp index f810bbf639300..5557427bc78f9 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp @@ -210,6 +210,27 @@ static Error dumpSectionToFile(StringRef SecName, StringRef Filename, "section '%s' not found", SecName.str().c_str()); } +static Error dumpRawDataURIToFile(StringRef Filename, int64_t Offset, + int64_t Size, ObjectFile &Obj) { + SmallString<2048> NameBuf; + raw_svector_ostream OutputFileName(NameBuf); + OutputFileName << Obj.getFileName().str() << "-offset" << Offset << "-size" + << Size << ".co"; + + Expected> BufferOrErr = + FileOutputBuffer::create(OutputFileName.str(), Size); + + if (!BufferOrErr) + return BufferOrErr.takeError(); + + MemoryBufferRef Input = Obj.getMemoryBufferRef(); + std::unique_ptr Buf = std::move(*BufferOrErr); + std::copy(Input.getBufferStart(), Input.getBufferStart() + Size, + Buf->getBufferStart()); + + return Buf->commit(); +} + Error Object::compressOrDecompressSections(const CommonConfig &Config) { // Build a list of sections we are going to replace. // We can't call `addSection` while iterating over sections, diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp index 1e1042ce2bc21..57a8244a9b0e5 100644 --- a/llvm/lib/Object/OffloadBundle.cpp +++ b/llvm/lib/Object/OffloadBundle.cpp @@ -37,26 +37,63 @@ Error extractOffloadBundle(MemoryBufferRef Contents, uint64_t SectionOffset, size_t Offset = 0; size_t NextbundleStart = 0; + StringRef Magic; + std::unique_ptr Buffer; // There could be multiple offloading bundles stored at this section. - while (NextbundleStart != StringRef::npos) { - std::unique_ptr Buffer = + while ((NextbundleStart != StringRef::npos) && + (Offset < Contents.getBuffer().size())) { + Buffer = MemoryBuffer::getMemBuffer(Contents.getBuffer().drop_front(Offset), "", /*RequiresNullTerminator=*/false); - // Create the FatBinBindle object. This will also create the Bundle Entry - // list info. - auto FatBundleOrErr = - OffloadBundleFatBin::create(*Buffer, SectionOffset + Offset, FileName); - if (!FatBundleOrErr) - return FatBundleOrErr.takeError(); - - // Add current Bundle to list. - Bundles.emplace_back(std::move(**FatBundleOrErr)); + if (identify_magic((*Buffer).getBuffer()) == + file_magic::offload_bundle_compressed) { + Magic = StringRef("CCOB"); + // decompress this bundle first. 
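+      // The magic may repeat when several compressed bundles share this section; the next occurrence (searched past the current header) bounds this bundle, and npos means it runs to the end of the section.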
+ NextbundleStart = (*Buffer).getBuffer().find(Magic, Magic.size()); + if (NextbundleStart == StringRef::npos) { + NextbundleStart = (*Buffer).getBuffer().size(); + } - // Find the next bundle by searching for the magic string - StringRef Str = Buffer->getBuffer(); - NextbundleStart = Str.find(StringRef("__CLANG_OFFLOAD_BUNDLE__"), 24); + ErrorOr> CodeOrErr = + MemoryBuffer::getMemBuffer((*Buffer).getBuffer().take_front( + NextbundleStart /*- Magic.size()*/), + FileName, false); + if (std::error_code EC = CodeOrErr.getError()) + return createFileError(FileName, EC); + + Expected> DecompressedBufferOrErr = + CompressedOffloadBundle::decompress(**CodeOrErr, false); + if (!DecompressedBufferOrErr) + return createStringError( + inconvertibleErrorCode(), + "Failed to decompress input: " + + llvm::toString(DecompressedBufferOrErr.takeError())); + + auto FatBundleOrErr = OffloadBundleFatBin::create( + **DecompressedBufferOrErr, Offset, FileName, true); + if (!FatBundleOrErr) + return FatBundleOrErr.takeError(); + + // Add current Bundle to list. + Bundles.emplace_back(std::move(**FatBundleOrErr)); + + } else if (identify_magic((*Buffer).getBuffer()) == + file_magic::offload_bundle) { + // Create the FatBinBindle object. This will also create the Bundle Entry + // list info. + auto FatBundleOrErr = OffloadBundleFatBin::create( + *Buffer, SectionOffset + Offset, FileName); + if (!FatBundleOrErr) + return FatBundleOrErr.takeError(); + + // Add current Bundle to list. + Bundles.emplace_back(std::move(**FatBundleOrErr)); + + Magic = StringRef("__CLANG_OFFLOAD_BUNDLE__"); + NextbundleStart = (*Buffer).getBuffer().find(Magic, Magic.size()); + } if (NextbundleStart != StringRef::npos) Offset += NextbundleStart; @@ -102,7 +139,8 @@ Error OffloadBundleFatBin::readEntries(StringRef Buffer, return errorCodeToError(object_error::parse_failed); auto Entry = std::make_unique( - EntryOffset + SectionOffset, EntrySize, EntryIDSize, EntryID); + EntryOffset + SectionOffset, EntrySize, EntryIDSize, + std::move(EntryID.str())); Entries.push_back(*Entry); } @@ -112,18 +150,22 @@ Error OffloadBundleFatBin::readEntries(StringRef Buffer, Expected> OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset, - StringRef FileName) { + StringRef FileName, bool Decompress) { if (Buf.getBufferSize() < 24) return errorCodeToError(object_error::parse_failed); // Check for magic bytes. - if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle) + if ((identify_magic(Buf.getBuffer()) != file_magic::offload_bundle) && + (identify_magic(Buf.getBuffer()) != + file_magic::offload_bundle_compressed)) return errorCodeToError(object_error::parse_failed); - OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName); + OffloadBundleFatBin *TheBundle = + new OffloadBundleFatBin(Buf, FileName, Decompress); // Read the Bundle Entries - Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset); + Error Err = + TheBundle->readEntries(Buf.getBuffer(), Decompress ? 0 : SectionOffset); if (Err) return errorCodeToError(object_error::parse_failed); @@ -172,28 +214,9 @@ Error object::extractOffloadBundleFatBinary( "COFF object files not supported.\n"); MemoryBufferRef Contents(*Buffer, Obj.getFileName()); - - if (llvm::identify_magic(*Buffer) == - llvm::file_magic::offload_bundle_compressed) { - // Decompress the input if necessary. 
- Expected> DecompressedBufferOrErr = - CompressedOffloadBundle::decompress(Contents, false); - - if (!DecompressedBufferOrErr) - return createStringError( - inconvertibleErrorCode(), - "Failed to decompress input: " + - llvm::toString(DecompressedBufferOrErr.takeError())); - - MemoryBuffer &DecompressedInput = **DecompressedBufferOrErr; - if (Error Err = extractOffloadBundle(DecompressedInput, SectionOffset, - Obj.getFileName(), Bundles)) - return Err; - } else { - if (Error Err = extractOffloadBundle(Contents, SectionOffset, - Obj.getFileName(), Bundles)) - return Err; - } + if (Error Err = extractOffloadBundle(Contents, SectionOffset, + Obj.getFileName(), Bundles)) + return Err; } } return Error::success(); @@ -221,6 +244,22 @@ Error object::extractCodeObject(const ObjectFile &Source, int64_t Offset, return Error::success(); } +Error object::extractCodeObject(const MemoryBufferRef Buffer, int64_t Offset, + int64_t Size, StringRef OutputFileName) { + Expected> BufferOrErr = + FileOutputBuffer::create(OutputFileName, Size); + if (!BufferOrErr) + return BufferOrErr.takeError(); + + std::unique_ptr Buf = std::move(*BufferOrErr); + std::copy(Buffer.getBufferStart() + Offset, + Buffer.getBufferStart() + Offset + Size, Buf->getBufferStart()); + if (Error E = Buf->commit()) + return E; + + return Error::success(); +} + // given a file name, offset, and size, extract data into a code object file, // into file -offset-size.co Error object::extractOffloadBundleByURI(StringRef URIstr) { @@ -260,11 +299,233 @@ static std::string formatWithCommas(unsigned long long Value) { } llvm::Expected> -CompressedOffloadBundle::decompress(llvm::MemoryBufferRef &Input, +CompressedOffloadBundle::compress(llvm::compression::Params P, + const llvm::MemoryBuffer &Input, + uint16_t Version, bool Verbose) { + if (!llvm::compression::zstd::isAvailable() && + !llvm::compression::zlib::isAvailable()) + return createStringError(llvm::inconvertibleErrorCode(), + "Compression not supported"); + llvm::Timer HashTimer("Hash Calculation Timer", "Hash calculation time", + OffloadBundlerTimerGroup); + if (Verbose) + HashTimer.startTimer(); + llvm::MD5 Hash; + llvm::MD5::MD5Result Result; + Hash.update(Input.getBuffer()); + Hash.final(Result); + uint64_t TruncatedHash = Result.low(); + if (Verbose) + HashTimer.stopTimer(); + + SmallVector CompressedBuffer; + auto BufferUint8 = llvm::ArrayRef( + reinterpret_cast(Input.getBuffer().data()), + Input.getBuffer().size()); + llvm::Timer CompressTimer("Compression Timer", "Compression time", + OffloadBundlerTimerGroup); + if (Verbose) + CompressTimer.startTimer(); + llvm::compression::compress(P, BufferUint8, CompressedBuffer); + if (Verbose) + CompressTimer.stopTimer(); + + uint16_t CompressionMethod = static_cast(P.format); + + // Store sizes in 64-bit variables first + uint64_t UncompressedSize64 = Input.getBuffer().size(); + uint64_t TotalFileSize64; + + // Calculate total file size based on version + if (Version == 2) { + // For V2, ensure the sizes don't exceed 32-bit limit + if (UncompressedSize64 > std::numeric_limits::max()) + return createStringError(llvm::inconvertibleErrorCode(), + "Uncompressed size exceeds version 2 limit"); + if ((MagicNumber.size() + sizeof(uint32_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint32_t) + sizeof(TruncatedHash) + + CompressedBuffer.size()) > std::numeric_limits::max()) + return createStringError(llvm::inconvertibleErrorCode(), + "Total file size exceeds version 2 limit"); + + TotalFileSize64 = MagicNumber.size() + 
sizeof(uint32_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint32_t) + + sizeof(TruncatedHash) + CompressedBuffer.size(); + } else { // Version 3 + TotalFileSize64 = MagicNumber.size() + sizeof(uint64_t) + sizeof(Version) + + sizeof(CompressionMethod) + sizeof(uint64_t) + + sizeof(TruncatedHash) + CompressedBuffer.size(); + } + + SmallVector FinalBuffer; + llvm::raw_svector_ostream OS(FinalBuffer); + OS << MagicNumber; + OS.write(reinterpret_cast(&Version), sizeof(Version)); + OS.write(reinterpret_cast(&CompressionMethod), + sizeof(CompressionMethod)); + + // Write size fields according to version + if (Version == 2) { + uint32_t TotalFileSize32 = static_cast(TotalFileSize64); + uint32_t UncompressedSize32 = static_cast(UncompressedSize64); + OS.write(reinterpret_cast(&TotalFileSize32), + sizeof(TotalFileSize32)); + OS.write(reinterpret_cast(&UncompressedSize32), + sizeof(UncompressedSize32)); + } else { // Version 3 + OS.write(reinterpret_cast(&TotalFileSize64), + sizeof(TotalFileSize64)); + OS.write(reinterpret_cast(&UncompressedSize64), + sizeof(UncompressedSize64)); + } + + OS.write(reinterpret_cast(&TruncatedHash), + sizeof(TruncatedHash)); + OS.write(reinterpret_cast(CompressedBuffer.data()), + CompressedBuffer.size()); + + if (Verbose) { + auto MethodUsed = + P.format == llvm::compression::Format::Zstd ? "zstd" : "zlib"; + double CompressionRate = + static_cast(UncompressedSize64) / CompressedBuffer.size(); + double CompressionTimeSeconds = CompressTimer.getTotalTime().getWallTime(); + double CompressionSpeedMBs = + (UncompressedSize64 / (1024.0 * 1024.0)) / CompressionTimeSeconds; + llvm::errs() << "Compressed bundle format version: " << Version << "\n" + << "Total file size (including headers): " + << formatWithCommas(TotalFileSize64) << " bytes\n" + << "Compression method used: " << MethodUsed << "\n" + << "Compression level: " << P.level << "\n" + << "Binary size before compression: " + << formatWithCommas(UncompressedSize64) << " bytes\n" + << "Binary size after compression: " + << formatWithCommas(CompressedBuffer.size()) << " bytes\n" + << "Compression rate: " + << llvm::format("%.2lf", CompressionRate) << "\n" + << "Compression ratio: " + << llvm::format("%.2lf%%", 100.0 / CompressionRate) << "\n" + << "Compression speed: " + << llvm::format("%.2lf MB/s", CompressionSpeedMBs) << "\n" + << "Truncated MD5 hash: " + << llvm::format_hex(TruncatedHash, 16) << "\n"; + } + + return llvm::MemoryBuffer::getMemBufferCopy( + llvm::StringRef(FinalBuffer.data(), FinalBuffer.size())); +} + +// Use packed structs to avoid padding, such that the structs map the serialized +// format. 
+LLVM_PACKED_START +union RawCompressedBundleHeader { + struct CommonFields { + uint32_t Magic; + uint16_t Version; + uint16_t Method; + }; + + struct V1Header { + CommonFields Common; + uint32_t UncompressedFileSize; + uint64_t Hash; + }; + + struct V2Header { + CommonFields Common; + uint32_t FileSize; + uint32_t UncompressedFileSize; + uint64_t Hash; + }; + + struct V3Header { + CommonFields Common; + uint64_t FileSize; + uint64_t UncompressedFileSize; + uint64_t Hash; + }; + + CommonFields Common; + V1Header V1; + V2Header V2; + V3Header V3; +}; +LLVM_PACKED_END + +// Helper method to get header size based on version +static size_t getHeaderSize(uint16_t Version) { + switch (Version) { + case 1: + return sizeof(RawCompressedBundleHeader::V1Header); + case 2: + return sizeof(RawCompressedBundleHeader::V2Header); + case 3: + return sizeof(RawCompressedBundleHeader::V3Header); + default: + llvm_unreachable("Unsupported version"); + } +} + +Expected +CompressedOffloadBundle::CompressedBundleHeader::tryParse(StringRef Blob) { + assert(Blob.size() >= sizeof(RawCompressedBundleHeader::CommonFields)); + assert(llvm::identify_magic(Blob) == + llvm::file_magic::offload_bundle_compressed); + + RawCompressedBundleHeader Header; + memcpy(&Header, Blob.data(), std::min(Blob.size(), sizeof(Header))); + + CompressedBundleHeader Normalized; + Normalized.Version = Header.Common.Version; + + size_t RequiredSize = getHeaderSize(Normalized.Version); + + if (Blob.size() < RequiredSize) + return createStringError(inconvertibleErrorCode(), + "Compressed bundle header size too small"); + + switch (Normalized.Version) { + case 1: + Normalized.UncompressedFileSize = Header.V1.UncompressedFileSize; + Normalized.Hash = Header.V1.Hash; + break; + case 2: + Normalized.FileSize = Header.V2.FileSize; + Normalized.UncompressedFileSize = Header.V2.UncompressedFileSize; + Normalized.Hash = Header.V2.Hash; + break; + case 3: + Normalized.FileSize = Header.V3.FileSize; + Normalized.UncompressedFileSize = Header.V3.UncompressedFileSize; + Normalized.Hash = Header.V3.Hash; + break; + default: + return createStringError(inconvertibleErrorCode(), + "Unknown compressed bundle version"); + } + + // Determine compression format + switch (Header.Common.Method) { + case static_cast(compression::Format::Zlib): + case static_cast(compression::Format::Zstd): + Normalized.CompressionFormat = + static_cast(Header.Common.Method); + break; + default: + return createStringError(inconvertibleErrorCode(), + "Unknown compressing method"); + } + + return Normalized; +} + +llvm::Expected> +CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input, bool Verbose) { StringRef Blob = Input.getBuffer(); - if (Blob.size() < V1HeaderSize) + // Check minimum header size (using V1 as it's the smallest) + if (Blob.size() < sizeof(RawCompressedBundleHeader::CommonFields)) return llvm::MemoryBuffer::getMemBufferCopy(Blob); if (llvm::identify_magic(Blob) != @@ -274,43 +535,20 @@ CompressedOffloadBundle::decompress(llvm::MemoryBufferRef &Input, return llvm::MemoryBuffer::getMemBufferCopy(Blob); } - size_t CurrentOffset = MagicSize; - - uint16_t ThisVersion; - memcpy(&ThisVersion, Blob.data() + CurrentOffset, sizeof(uint16_t)); - CurrentOffset += VersionFieldSize; + Expected HeaderOrErr = + CompressedBundleHeader::tryParse(Blob); + if (!HeaderOrErr) + return HeaderOrErr.takeError(); - uint16_t CompressionMethod; - memcpy(&CompressionMethod, Blob.data() + CurrentOffset, sizeof(uint16_t)); - CurrentOffset += MethodFieldSize; + const 
CompressedBundleHeader &Normalized = *HeaderOrErr; + unsigned ThisVersion = Normalized.Version; + size_t HeaderSize = getHeaderSize(ThisVersion); - uint32_t TotalFileSize; - if (ThisVersion >= 2) { - if (Blob.size() < V2HeaderSize) - return createStringError(inconvertibleErrorCode(), - "Compressed bundle header size too small"); - memcpy(&TotalFileSize, Blob.data() + CurrentOffset, sizeof(uint32_t)); - CurrentOffset += FileSizeFieldSize; - } + llvm::compression::Format CompressionFormat = Normalized.CompressionFormat; - uint32_t UncompressedSize; - memcpy(&UncompressedSize, Blob.data() + CurrentOffset, sizeof(uint32_t)); - CurrentOffset += UncompressedSizeFieldSize; - - uint64_t StoredHash; - memcpy(&StoredHash, Blob.data() + CurrentOffset, sizeof(uint64_t)); - CurrentOffset += HashFieldSize; - - llvm::compression::Format CompressionFormat; - if (CompressionMethod == - static_cast(llvm::compression::Format::Zlib)) - CompressionFormat = llvm::compression::Format::Zlib; - else if (CompressionMethod == - static_cast(llvm::compression::Format::Zstd)) - CompressionFormat = llvm::compression::Format::Zstd; - else - return createStringError(inconvertibleErrorCode(), - "Unknown compressing method"); + size_t TotalFileSize = Normalized.FileSize.value_or(0); + size_t UncompressedSize = Normalized.UncompressedFileSize; + auto StoredHash = Normalized.Hash; llvm::Timer DecompressTimer("Decompression Timer", "Decompression time", OffloadBundlerTimerGroup); @@ -318,7 +556,9 @@ CompressedOffloadBundle::decompress(llvm::MemoryBufferRef &Input, DecompressTimer.startTimer(); SmallVector DecompressedData; - StringRef CompressedData = Blob.substr(CurrentOffset); + StringRef CompressedData = + Blob.substr(HeaderSize, TotalFileSize - HeaderSize); + if (llvm::Error DecompressionError = llvm::compression::decompress( CompressionFormat, llvm::arrayRefFromStringRef(CompressedData), DecompressedData, UncompressedSize)) @@ -332,7 +572,7 @@ CompressedOffloadBundle::decompress(llvm::MemoryBufferRef &Input, double DecompressionTimeSeconds = DecompressTimer.getTotalTime().getWallTime(); - // Recalculate MD5 hash for integrity check. 
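+  // The header stores the low 8 bytes of the MD5 of the uncompressed payload (Result.low() in compress()), so the check can only run after decompression succeeds.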
+ // Recalculate MD5 hash for integrity check llvm::Timer HashRecalcTimer("Hash Recalculation Timer", "Hash recalculation time", OffloadBundlerTimerGroup); @@ -378,90 +618,3 @@ CompressedOffloadBundle::decompress(llvm::MemoryBufferRef &Input, return llvm::MemoryBuffer::getMemBufferCopy( llvm::toStringRef(DecompressedData)); } - -llvm::Expected> -CompressedOffloadBundle::compress(llvm::compression::Params P, - const llvm::MemoryBuffer &Input, - bool Verbose) { - if (!llvm::compression::zstd::isAvailable() && - !llvm::compression::zlib::isAvailable()) - return createStringError(llvm::inconvertibleErrorCode(), - "Compression not supported"); - - llvm::Timer HashTimer("Hash Calculation Timer", "Hash calculation time", - OffloadBundlerTimerGroup); - if (Verbose) - HashTimer.startTimer(); - llvm::MD5 Hash; - llvm::MD5::MD5Result Result; - Hash.update(Input.getBuffer()); - Hash.final(Result); - uint64_t TruncatedHash = Result.low(); - if (Verbose) - HashTimer.stopTimer(); - - SmallVector CompressedBuffer; - auto BufferUint8 = llvm::ArrayRef( - reinterpret_cast(Input.getBuffer().data()), - Input.getBuffer().size()); - - llvm::Timer CompressTimer("Compression Timer", "Compression time", - OffloadBundlerTimerGroup); - if (Verbose) - CompressTimer.startTimer(); - llvm::compression::compress(P, BufferUint8, CompressedBuffer); - if (Verbose) - CompressTimer.stopTimer(); - - uint16_t CompressionMethod = static_cast(P.format); - uint32_t UncompressedSize = Input.getBuffer().size(); - uint32_t TotalFileSize = MagicNumber.size() + sizeof(TotalFileSize) + - sizeof(Version) + sizeof(CompressionMethod) + - sizeof(UncompressedSize) + sizeof(TruncatedHash) + - CompressedBuffer.size(); - - SmallVector FinalBuffer; - llvm::raw_svector_ostream OS(FinalBuffer); - OS << MagicNumber; - OS.write(reinterpret_cast(&Version), sizeof(Version)); - OS.write(reinterpret_cast(&CompressionMethod), - sizeof(CompressionMethod)); - OS.write(reinterpret_cast(&TotalFileSize), - sizeof(TotalFileSize)); - OS.write(reinterpret_cast(&UncompressedSize), - sizeof(UncompressedSize)); - OS.write(reinterpret_cast(&TruncatedHash), - sizeof(TruncatedHash)); - OS.write(reinterpret_cast(CompressedBuffer.data()), - CompressedBuffer.size()); - - if (Verbose) { - auto MethodUsed = - P.format == llvm::compression::Format::Zstd ? 
"zstd" : "zlib"; - double CompressionRate = - static_cast(UncompressedSize) / CompressedBuffer.size(); - double CompressionTimeSeconds = CompressTimer.getTotalTime().getWallTime(); - double CompressionSpeedMBs = - (UncompressedSize / (1024.0 * 1024.0)) / CompressionTimeSeconds; - - llvm::errs() << "Compressed bundle format version: " << Version << "\n" - << "Total file size (including headers): " - << formatWithCommas(TotalFileSize) << " bytes\n" - << "Compression method used: " << MethodUsed << "\n" - << "Compression level: " << P.level << "\n" - << "Binary size before compression: " - << formatWithCommas(UncompressedSize) << " bytes\n" - << "Binary size after compression: " - << formatWithCommas(CompressedBuffer.size()) << " bytes\n" - << "Compression rate: " - << llvm::format("%.2lf", CompressionRate) << "\n" - << "Compression ratio: " - << llvm::format("%.2lf%%", 100.0 / CompressionRate) << "\n" - << "Compression speed: " - << llvm::format("%.2lf MB/s", CompressionSpeedMBs) << "\n" - << "Truncated MD5 hash: " - << llvm::format_hex(TruncatedHash, 16) << "\n"; - } - return llvm::MemoryBuffer::getMemBufferCopy( - llvm::StringRef(FinalBuffer.data(), FinalBuffer.size())); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 7ca4649639bb1..f6412ade4b29f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -127,10 +127,8 @@ static bool isDSAddress(const Constant *C) { return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS; } -/// Returns true if the function requires the implicit argument be passed -/// regardless of the function contents. -static bool funcRequiresHostcallPtr(const Function &F) { - // Sanitizers require the hostcall buffer passed in the implicit arguments. +/// Returns true if sanitizer attributes are present on a function. +static bool hasSanitizerAttributes(const Function &F) { return F.hasFnAttribute(Attribute::SanitizeAddress) || F.hasFnAttribute(Attribute::SanitizeThread) || F.hasFnAttribute(Attribute::SanitizeMemory) || @@ -465,15 +463,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { // If the function requires the implicit arg pointer due to sanitizers, // assume it's needed even if explicitly marked as not requiring it. - const bool NeedsHostcall = funcRequiresHostcallPtr(*F); - if (NeedsHostcall) { + // Flat scratch initialization is needed because `asan_malloc_impl` + // calls introduced later in pipeline will have flat scratch accesses. + // FIXME: FLAT_SCRATCH_INIT will not be required here if device-libs + // implementation for `asan_malloc_impl` is updated. + const bool HasSanitizerAttrs = hasSanitizerAttributes(*F); + if (HasSanitizerAttrs) { removeAssumedBits(IMPLICIT_ARG_PTR); removeAssumedBits(HOSTCALL_PTR); + removeAssumedBits(FLAT_SCRATCH_INIT); } for (auto Attr : ImplicitAttrs) { - if (NeedsHostcall && - (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR)) + if (HasSanitizerAttrs && + (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR || + Attr.first == FLAT_SCRATCH_INIT)) continue; if (F->hasFnAttribute(Attr.second)) @@ -1299,74 +1303,6 @@ struct AAAMDGPUNoAGPR const char AAAMDGPUNoAGPR::ID = 0; -/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute -/// based on the finalized 'amdgpu-flat-work-group-size' attribute. -/// Both attributes start with narrow ranges that expand during iteration. 
-/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range, -/// preventing optimal updates later. Therefore, waves-per-eu can't be updated -/// with intermediate values during the attributor run. We defer the -/// finalization of waves-per-eu until after the flat-workgroup-size is -/// finalized. -/// TODO: Remove this and move similar logic back into the attributor run once -/// we have a better representation for waves-per-eu. -static bool updateWavesPerEU(Module &M, TargetMachine &TM) { - bool Changed = false; - - LLVMContext &Ctx = M.getContext(); - - for (Function &F : M) { - if (F.isDeclaration()) - continue; - - const GCNSubtarget &ST = TM.getSubtarget(F); - - std::optional>> - FlatWgrpSizeAttr = - AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size"); - - unsigned MinWavesPerEU = ST.getMinWavesPerEU(); - unsigned MaxWavesPerEU = ST.getMaxWavesPerEU(); - - unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize(); - unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize(); - if (FlatWgrpSizeAttr.has_value()) { - MinFlatWgrpSize = FlatWgrpSizeAttr->first; - MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second); - } - - // Start with the "best" range. - unsigned Min = MinWavesPerEU; - unsigned Max = MinWavesPerEU; - - // Compute the range from flat workgroup size. `getWavesPerEU` will also - // account for the 'amdgpu-waves-er-eu' attribute. - auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] = - ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize}); - - // For the lower bound, we have to "tighten" it. - Min = std::max(Min, MinFromFlatWgrpSize); - // For the upper bound, we have to "extend" it. - Max = std::max(Max, MaxFromFlatWgrpSize); - - // Clamp the range to the max range. - Min = std::max(Min, MinWavesPerEU); - Max = std::min(Max, MaxWavesPerEU); - - // Update the attribute if it is not the max. - if (Min != MinWavesPerEU || Max != MaxWavesPerEU) { - SmallString<10> Buffer; - raw_svector_ostream OS(Buffer); - OS << Min << ',' << Max; - Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu"); - Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str()); - F.addFnAttr(NewAttr); - Changed |= OldAttr == NewAttr; - } - } - - return Changed; -} - static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, AMDGPUAttributorOptions Options, ThinOrFullLTOPhase LTOPhase) { @@ -1447,11 +1383,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, } } - bool Changed = A.run() == ChangeStatus::CHANGED; - - Changed |= updateWavesPerEU(M, TM); - - return Changed; + return A.run() == ChangeStatus::CHANGED; } } // namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ea79c57080faa..dd968782cc6f2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -6675,13 +6675,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x1) ? 
(int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : (int64_t)SISrcMods::DST_OP_SEL); } @@ -6690,13 +6690,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)(SISrcMods::OP_SEL_0) : 0); } @@ -6719,14 +6719,15 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0); + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + ? (int64_t)SISrcMods::DST_OP_SEL + : 0); } void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a5f2ec6f6e558..3fc764134f7b9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -647,7 +647,8 @@ class AMDGPULowerModuleLDS { ModuleScopeVariables.insert(GV); } else if (K.second.size() == 1) { KernelAccessVariables.insert(GV); - } else if (set_is_subset(K.second, HybridModuleRootKernels)) { + } else if (K.second == HybridModuleRootKernels && + set_is_subset(K.second, HybridModuleRootKernels)) { ModuleScopeVariables.insert(GV); } else { TableLookupVariables.insert(GV); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 1749bb1dbfdfc..611c0ab5bf04a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -70,7 +70,7 @@ static cl::opt PromoteAllocaToVectorMaxRegs( "amdgpu-promote-alloca-to-vector-max-regs", cl::desc( "Maximum vector size (in 32b registers) to use when promoting alloca"), - cl::init(16)); + cl::init(32)); // Use up to 1/4 of available register budget for vectorization. // FIXME: Increase the limit for whole function budgets? Perhaps x2? @@ -287,8 +287,12 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { // Load per function limits, overriding with global options where appropriate. 
+ // R600 register tuples/aliasing are fragile with large vector promotions so + // apply architecture specific limit here. + const int R600MaxVectorRegs = 16; MaxVectorRegs = F.getFnAttributeAsParsedInteger( - "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs); + "amdgpu-promote-alloca-to-vector-max-regs", + IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs); if (PromoteAllocaToVectorMaxRegs.getNumOccurrences()) MaxVectorRegs = PromoteAllocaToVectorMaxRegs; VGPRBudgetRatio = F.getFnAttributeAsParsedInteger( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index d095fc6cf9549..73a73d56ef232 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -216,15 +216,6 @@ AMDGPUSubtarget::getWavesPerEU(const Function &F) const { return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F); } -std::pair AMDGPUSubtarget::getWavesPerEU( - const Function &F, std::pair FlatWorkGroupSizes) const { - // Minimum number of bytes allocated in the LDS. - unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", - {0, UINT32_MAX}, true) - .first; - return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F); -} - std::pair AMDGPUSubtarget::getWavesPerEU(std::pair FlatWorkGroupSizes, unsigned LDSBytes, const Function &F) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 24f4df2aff9d1..b6dfdbe6e7749 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -347,7 +347,8 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { if (Opcode == Instruction::Load || Opcode == Instruction::Store) return 32 * 4 / ElemWidth; - // For a given width return the max 0number of elements that can be combined + + // For a given width return the max number of elements that can be combined // into a wider bit value: return (ElemWidth == 8 && ST->has16BitInsts()) ? 4 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2 @@ -1266,6 +1267,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I, if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) Ops.push_back(&Op); + + // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors) + // will be optimized away, and sinking them can help SDAG combines. + DataLayout DL = I->getModule()->getDataLayout(); + auto IsFreeExtractInsert = [&DL, this](VectorType *VecType, + unsigned VecIndex) { + unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType()); + return EltSize >= 32 || + (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts()); + }; + + uint64_t VecIndex; + Value *Vec; + if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) { + Instruction *VecOpInst = + dyn_cast(cast(Op.get())->getOperand(0)); + // If a zero cost extractvector instruction is the only use of the vector, + // then it may be combined with the def. 
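+        // No need to sink it in that case: folding into its def where it stands is at least as good as sinking.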
+ if (VecOpInst && VecOpInst->hasOneUse()) + continue; + + if (IsFreeExtractInsert(cast(Vec->getType()), VecIndex)) + Ops.push_back(&Op); + + continue; + } + + if (match(Op.get(), + m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) { + if (IsFreeExtractInsert(cast(Vec->getType()), VecIndex)) + Ops.push_back(&Op); + + continue; + } + + if (auto *Shuffle = dyn_cast(Op.get())) { + if (Shuffle->isIdentity()) { + Ops.push_back(&Op); + continue; + } + + unsigned EltSize = DL.getTypeSizeInBits( + cast(cast(Shuffle->getType())) + ->getElementType()); + + // For i32 (or greater) shufflevectors, these will be lowered into a + // series of insert / extract elements, which will be coalesced away. + if (EltSize >= 32) { + Ops.push_back(&Op); + continue; + } + + if (EltSize < 16 || !ST->has16BitInsts()) + continue; + + int NumSubElts, SubIndex; + if (Shuffle->changesLength()) { + if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) { + Ops.push_back(&Op); + continue; + } + + if (Shuffle->isExtractSubvectorMask(SubIndex) || + Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) { + if (!(SubIndex % 2)) { + Ops.push_back(&Op); + continue; + } + } + } + + if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() || + Shuffle->isSingleSource()) { + Ops.push_back(&Op); + continue; + } + + if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) { + if (!(SubIndex % 2)) { + Ops.push_back(&Op); + continue; + } + } + } } return !Ops.empty(); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7d6723a6108be..334afd3a2a5b4 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI) { - return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR); + return STI->isSGPRClass(RC) + ? SGPR + : (STI->isAGPRClass(RC) + ? AGPR + : (STI->isVectorSuperClass(RC) ? AVGPR : VGPR)); } void GCNRegPressure::inc(unsigned Reg, diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 3749b6d1efc63..ea33a229110c1 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -29,43 +29,57 @@ class raw_ostream; class SlotIndex; struct GCNRegPressure { - enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS }; + enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; GCNRegPressure() { clear(); } - bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; } + bool empty() const { + return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; + } void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } - /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p - /// UnifiedVGPRFile + /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure + /// dependent upon \p UnifiedVGPRFile unsigned getVGPRNum(bool UnifiedVGPRFile) const { if (UnifiedVGPRFile) { - return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR]) - : Value[VGPR]; + return Value[AGPR] + ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR]) + : Value[VGPR] + Value[AVGPR]; } - return std::max(Value[VGPR], Value[AGPR]); + // AVGPR assignment priority is based on the width of the register. Account + // AVGPR pressure as VGPR. 
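+    // e.g. VGPR = 10, AVGPR = 4, AGPR = 12 gives max(10 + 4, 12) = 14 VGPRs.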
+ return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]); } /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs - /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file. + /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified + /// VGPR file. inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs, - unsigned NumAGPRs) { - return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + + unsigned NumAGPRs, + unsigned NumAVGPRs) { + + // Assume AVGPRs will be assigned as VGPRs. + return alignTo(NumArchVGPRs + NumAVGPRs, + AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + NumAGPRs; } - /// \returns the ArchVGPR32 pressure - unsigned getArchVGPRNum() const { return Value[VGPR]; } + /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be + /// allocated as VGPR + unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; } /// \returns the AccVGPR32 pressure unsigned getAGPRNum() const { return Value[AGPR]; } + /// \returns the AVGPR32 pressure + unsigned getAVGPRNum() const { return Value[AVGPR]; } unsigned getVGPRTuplesWeight() const { - return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]); + return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR], + Value[TOTAL_KINDS + AGPR]); } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2a3d7bfc6ddc0..107146d024ab4 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1765,6 +1765,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); + SetVector ConstantFoldCandidates; for (FoldCandidate &Fold : FoldList) { assert(!Fold.isReg() || Fold.Def.OpToFold); if (Fold.isReg() && Fold.getReg().isVirtual()) { @@ -1786,11 +1787,22 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << static_cast(Fold.UseOpNo) << " of " << *Fold.UseMI); + + if (Fold.isImm()) + ConstantFoldCandidates.insert(Fold.UseMI); + } else if (Fold.Commuted) { // Restoring instruction's original operand order if fold has failed. TII->commuteInstruction(*Fold.UseMI, false); } } + + for (MachineInstr *MI : ConstantFoldCandidates) { + if (tryConstantFoldOp(MI)) { + LLVM_DEBUG(dbgs() << "Constant folded " << *MI); + Changed = true; + } + } return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 3379bc06d2994..44fdff3c77362 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1191,6 +1191,7 @@ void SIFrameLowering::emitCSRSpillStores(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch // registers. However, save all lanes of callee-saved VGPRs. 
Due to this, we @@ -1219,6 +1220,12 @@ void SIFrameLowering::emitCSRSpillStores(MachineFunction &MF, } }; + for (const Register Reg : make_first_range(WWMScratchRegs)) { + if (!MRI.isReserved(Reg)) { + MRI.addLiveIn(Reg); + MBB.addLiveIn(Reg); + } + } StoreWWMRegisters(WWMScratchRegs); if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6cf2055c8e565..eef4a29fe49ef 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1932,10 +1932,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || AddrSpace == AMDGPUAS::FLAT_ADDRESS) { bool AlignedBy4 = Alignment >= Align(4); + if (Subtarget->hasUnalignedScratchAccessEnabled()) { + if (IsFast) + *IsFast = AlignedBy4 ? Size : 1; + return true; + } + if (IsFast) *IsFast = AlignedBy4; - return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled(); + return AlignedBy4; } // So long as they are correct, wide global memory operations perform better @@ -17300,26 +17306,80 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { auto Op = RMW->getOperation(); switch (Op) { - case AtomicRMWInst::Xchg: { + case AtomicRMWInst::Xchg: // PCIe supports add and xchg for system atomics. return isAtomicRMWLegalXChgTy(RMW) ? TargetLowering::AtomicExpansionKind::None : TargetLowering::AtomicExpansionKind::CmpXChg; - } case AtomicRMWInst::Add: - case AtomicRMWInst::And: - case AtomicRMWInst::UIncWrap: - case AtomicRMWInst::UDecWrap: + // PCIe supports add and xchg for system atomics. return atomicSupportedIfLegalIntType(RMW); case AtomicRMWInst::Sub: + case AtomicRMWInst::And: case AtomicRMWInst::Or: - case AtomicRMWInst::Xor: { - // Atomic sub/or/xor do not work over PCI express, but atomic add - // does. InstCombine transforms these with 0 to or, so undo that. - if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) { - if (Constant *ConstVal = dyn_cast(RMW->getValOperand()); - ConstVal && ConstVal->isNullValue()) - return AtomicExpansionKind::Expand; + case AtomicRMWInst::Xor: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: { + if (AMDGPU::isFlatGlobalAddrSpace(AS) || + AS == AMDGPUAS::BUFFER_FAT_POINTER) { + // On most subtargets, for atomicrmw operations other than add/xchg, + // whether or not the instructions will behave correctly depends on where + // the address physically resides and what interconnect is used in the + // system configuration. On some some targets the instruction will nop, + // and in others synchronization will only occur at degraded device scope. + // + // If the allocation is known local to the device, the instructions should + // work correctly. + if (RMW->hasMetadata("amdgpu.no.remote.memory")) + return atomicSupportedIfLegalIntType(RMW); + + // If fine-grained remote memory works at device scope, we don't need to + // do anything. + if (!HasSystemScope && + Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics()) + return atomicSupportedIfLegalIntType(RMW); + + // If we are targeting a remote allocated address, it depends what kind of + // allocation the address belongs to. + // + // If the allocation is fine-grained (in host memory, or in PCIe peer + // device memory), the operation will fail depending on the target. 
+ // + // Note fine-grained host memory access does work on APUs or if XGMI is + // used, but we do not know if we are targeting an APU or the system + // configuration from the ISA version/target-cpu. + if (RMW->hasMetadata("amdgpu.no.fine.grained.memory")) + return atomicSupportedIfLegalIntType(RMW); + + if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || + Op == AtomicRMWInst::Xor) { + // Atomic sub/or/xor do not work over PCI express, but atomic add + // does. InstCombine transforms these with 0 to or, so undo that. + if (Constant *ConstVal = dyn_cast(RMW->getValOperand()); + ConstVal && ConstVal->isNullValue()) + return AtomicExpansionKind::Expand; + } + + // If the allocation could be in remote, fine-grained memory, the rmw + // instructions may fail. cmpxchg should work, so emit that. On some + // system configurations, PCIe atomics aren't supported so cmpxchg won't + // even work, so you're out of luck anyway. + + // In summary: + // + // Cases that may fail: + // - fine-grained pinned host memory + // - fine-grained migratable host memory + // - fine-grained PCIe peer device + // + // Cases that should work, but may be treated overly conservatively. + // - fine-grained host memory on an APU + // - fine-grained XGMI peer device + return AtomicExpansionKind::CmpXChg; } return atomicSupportedIfLegalIntType(RMW); @@ -17474,19 +17534,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::CmpXChg; } - case AtomicRMWInst::Min: - case AtomicRMWInst::Max: - case AtomicRMWInst::UMin: - case AtomicRMWInst::UMax: { - if (AMDGPU::isFlatGlobalAddrSpace(AS) || - AS == AMDGPUAS::BUFFER_FAT_POINTER) { - // Always expand system scope min/max atomics. - if (HasSystemScope) - return AtomicExpansionKind::CmpXChg; - } - - return atomicSupportedIfLegalIntType(RMW); - } case AtomicRMWInst::Nand: case AtomicRMWInst::FSub: default: diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359f03da6..98f1ae508575d 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1918,13 +1918,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // LOAD_CNT is only relevant to vgpr or LDS. unsigned RegNo = FIRST_LDS_VGPR; - // Only objects with alias scope info were added to LDSDMAScopes array. - // In the absense of the scope info we will not be able to disambiguate - // aliasing here. There is no need to try searching for a corresponding - // store slot. This is conservatively correct because in that case we - // will produce a wait using the first (general) LDS DMA wait slot which - // will wait on all of them anyway. - if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { + if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { if (MI.mayAlias(AA, *LDSDMAStores[I], true)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index bfac25a9b7fdc..9244cbae0d21d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6150,10 +6150,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, !Op.isIdenticalTo(*MO)) return false; - // Do not fold a frame index into an instruction that already has a frame - // index. The frame index handling code doesn't handle fixing up operand - // constraints if there are multiple indexes. 
- if (Op.isFI() && MO->isFI()) + // Do not fold a non-inlineable and non-register operand into an + // instruction that already has a frame index. The frame index handling + // code could not handle well when a frame index co-exists with another + // non-register operand, unless that operand is an inlineable immediate. + if (Op.isFI()) return false; } } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() && @@ -6203,6 +6204,66 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } +bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const { + bool IsGFX950Only = ST.hasGFX950Insts(); + bool IsGFX940Only = ST.hasGFX940Insts(); + + if (!IsGFX950Only && !IsGFX940Only) + return false; + + if (!isVALU(MI)) + return false; + + // V_COS, V_EXP, V_RCP, etc. + if (isTRANS(MI)) + return true; + + // DOT2, DOT2C, DOT4, etc. + if (isDOT(MI)) + return true; + + // MFMA, SMFMA + if (isMFMA(MI)) + return true; + + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_CVT_PK_BF8_F32_e64: + case AMDGPU::V_CVT_PK_FP8_F32_e64: + case AMDGPU::V_MQSAD_PK_U16_U8_e64: + case AMDGPU::V_MQSAD_U32_U8_e64: + case AMDGPU::V_PK_ADD_F16: + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_I16: + case AMDGPU::V_PK_ADD_U16: + case AMDGPU::V_PK_ASHRREV_I16: + case AMDGPU::V_PK_FMA_F16: + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMAC_F16_e32: + case AMDGPU::V_PK_FMAC_F16_e64: + case AMDGPU::V_PK_LSHLREV_B16: + case AMDGPU::V_PK_LSHRREV_B16: + case AMDGPU::V_PK_MAD_I16: + case AMDGPU::V_PK_MAD_U16: + case AMDGPU::V_PK_MAX_F16: + case AMDGPU::V_PK_MAX_I16: + case AMDGPU::V_PK_MAX_U16: + case AMDGPU::V_PK_MIN_F16: + case AMDGPU::V_PK_MIN_I16: + case AMDGPU::V_PK_MIN_U16: + case AMDGPU::V_PK_MOV_B32: + case AMDGPU::V_PK_MUL_F16: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_LO_U16: + case AMDGPU::V_PK_SUB_I16: + case AMDGPU::V_PK_SUB_U16: + case AMDGPU::V_QSAD_PK_U16_U8_e64: + return true; + default: + return false; + } +} + void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 8bc1089645f7d..b379ab9cb4d66 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1173,6 +1173,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const; + bool isNeverCoissue(MachineInstr &MI) const; + + /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO. + bool isLegalAV64PseudoImm(uint64_t Imm) const; /// Return true if this 64-bit VALU instruction has a 32-bit encoding. /// This function will return false if you pass it a 32-bit instruction. 
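For reference, a small sketch of how later code can consult this hook; the helper is illustrative and mirrors the gating that SIPreEmitPeephole applies further down in this patch:

    // Only F32 packed ALU ops that can never be dual-issued on gfx940/gfx950
    // are worth unpacking next to an MFMA; everything else co-issues as-is.
    static bool worthUnpacking(const SIInstrInfo &TII, MachineInstr &MI) {
      switch (MI.getOpcode()) {
      case AMDGPU::V_PK_ADD_F32:
      case AMDGPU::V_PK_MUL_F32:
      case AMDGPU::V_PK_FMA_F32:
        return TII.isNeverCoissue(MI);
      default:
        return false;
      }
    }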
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index f57d50e9610a4..db2d56df69bce 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1847,6 +1847,36 @@ foreach vt = Reg512Types.types in { } } +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; // 1024-bit bitcast foreach vt = Reg1024Types.types in { @@ -2488,6 +2518,7 @@ def : AMDGPUPat < >; let True16Predicate = NotHasTrue16BitInsts in { +let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern ; def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), @@ -2497,6 +2528,35 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; +} // isNotGFX9Plus + +let SubtargetPredicate = isGFX9GFX10 in { +def : GCNPat < + (rotr i32:$src0, i32:$src1), + (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src0, + /* src2_modifiers */ 0, + $src1, /* clamp */ 0, /* op_sel */ 0) +>; + +foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), + (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in +def : GCNPat; + +def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), + (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src1, + /* src2_modifiers */ 0, + $src2, /* clamp */ 0, /* op_sel */ 0) +>; +} // isGFX9GFX10 } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { @@ -3097,6 +3157,8 @@ def : GCNPat < (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; +// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped +// to V_PERM_B32. 
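+// (A single V_PERM_B32 reverses all four bytes, using a byte-select constant such as 0x00010203.)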
 let True16Predicate = NotHasTrue16BitInsts in
 def : GCNPat <
   (i32 (bswap i32:$a)),
@@ -3572,15 +3634,20 @@ def : GCNPat <
 // Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
 // Special case, can use V_ALIGNBIT (always uses encoded literal)
-let True16Predicate = NotHasTrue16BitInsts in
-def : GCNPat <
+let True16Predicate = NotHasTrue16BitInsts in {
+defvar BuildVectorToAlignBitPat =
   (vecTy (DivergentBinFrag<build_vector>
     (Ty !if(!eq(Ty, i16),
       (Ty (trunc (srl VGPR_32:$a, (i32 16)))),
       (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
-    (Ty VGPR_32:$b))),
-  (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
->;
+    (Ty VGPR_32:$b)));
+
+let SubtargetPredicate = isNotGFX9Plus in
+def : GCNPat<BuildVectorToAlignBitPat,
+             (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))>;
+
+let SubtargetPredicate = isGFX9GFX10 in
+def : GCNPat<BuildVectorToAlignBitPat,
+             (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, VGPR_32:$b,
+                                       /* src1_modifiers */ 0, VGPR_32:$a,
+                                       /* src2_modifiers */ 0, (i32 16),
+                                       /* clamp */ 0, /* op_sel */ 0)>;
+} //True16Predicate = NotHasTrue16BitInsts

 let True16Predicate = UseFakeTrue16Insts in
 def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 2c2ceedf8a2f6..4d3331ab353d3 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -9,11 +9,19 @@
 /// \file
 /// This pass performs the peephole optimizations before code emission.
 ///
+/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16,
+/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be
+/// co-issued. This helps with overlapping MFMA and certain vector instructions
+/// in machine schedules and is expected to improve performance. Only packed
+/// instructions that fit within the MFMA's latency window are unpacked; the
+/// rest are left untouched.
+/// TODO: Add support for F16 packed instructions
 //===----------------------------------------------------------------------===//

 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/BranchProbability.h"
@@ -39,6 +47,37 @@ class SIPreEmitPeephole {
                              const MachineBasicBlock &From,
                              const MachineBasicBlock &To) const;
   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
+  // Check if the machine instruction being processed is a supported packed
+  // instruction.
+  bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+  // Creates a list of packed instructions following an MFMA that are suitable
+  // for unpacking.
+  void collectUnpackingCandidates(MachineInstr &BeginMI,
+                                  SetVector<MachineInstr *> &InstrsToUnpack,
+                                  uint16_t NumMFMACycles);
+  // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
+  // op_sel_hi:[0,0,0]
+  // ==>
+  // v_fma_f32 v0, v1, v3, v3
+  // v_fma_f32 v1, v0, v2, v2
+  // Here, we have overwritten v0 before we use it. This function checks if
+  // unpacking can lead to such a situation.
+  bool canUnpackingClobberRegister(const MachineInstr &MI);
+  // Unpack and insert F32 packed instructions. Currently, only V_PK_MUL,
+  // V_PK_ADD, and V_PK_FMA are supported for this transformation.
+  void performF32Unpacking(MachineInstr &I);
+  // Select the corresponding unpacked instruction.
+  uint16_t mapToUnpackedOpcode(MachineInstr &I);
+  // Creates the unpacked instruction to be inserted. Adds source modifiers to
+  // the unpacked instructions based on the source modifiers in the packed
+  // instruction.
+  MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
+                                       bool IsHiBits);
+  // Process operands/source modifiers from packed instructions and insert the
+  // appropriate source modifiers and operands into the unpacked instructions.
+  void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
+                         bool IsHiBits, const MachineOperand &SrcMO);

 public:
   bool run(MachineFunction &MF);
@@ -274,11 +313,9 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
       return false;
     if (IdxReg && I->modifiesRegister(IdxReg, TRI))
       return false;
-    if (llvm::any_of(I->operands(),
-                     [&MRI, this](const MachineOperand &MO) {
-                       return MO.isReg() &&
-                              TRI->isVectorRegister(MRI, MO.getReg());
-                     })) {
+    if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
+          return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
+        })) {
       // The only exception allowed here is another indirect vector move
       // with the same mode.
       if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
@@ -417,6 +454,279 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
   return true;
 }

+// If support is extended to new operations, add tests in
+// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
+bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
+  if (!TII->isNeverCoissue(MI))
+    return false;
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("Fully covered switch");
+}
+
+bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
+  unsigned OpCode = MI.getOpcode();
+  Register DstReg = MI.getOperand(0).getReg();
+  // Only the first register in the register pair needs to be checked due to
+  // the unpacking order. Packed instructions are unpacked such that the lower
+  // 32 bits (i.e., the first register in the pair) are written first. This
+  // can introduce dependencies if the first register is written in one
+  // instruction and then read as part of the higher 32 bits in the subsequent
+  // instruction. Such scenarios can arise due to specific combinations of
+  // op_sel and op_sel_hi modifiers.
+  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
+
+  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  if (Src0MO && Src0MO->isReg()) {
+    Register SrcReg0 = Src0MO->getReg();
+    unsigned Src0Mods =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
+    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
+                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
+                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
+    // Check if the register selected by op_sel_hi is the same as the first
+    // register in the destination register pair.
+    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
+      return true;
+  }
+
+  const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1MO && Src1MO->isReg()) {
+    Register SrcReg1 = Src1MO->getReg();
+    unsigned Src1Mods =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
+    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
+                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
+                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
+    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
+      return true;
+  }
+
+  // Applicable for packed instructions with 3 source operands, such as
+  // V_PK_FMA.
+  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
+    const MachineOperand *Src2MO =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    if (Src2MO && Src2MO->isReg()) {
+      Register SrcReg2 = Src2MO->getReg();
+      unsigned Src2Mods =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
+      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
+                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
+                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
+      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
+  unsigned Opcode = I.getOpcode();
+  // Use the 64-bit encoding to allow use of VOP3 instructions: VOP3 e64
+  // instructions allow source modifiers, while e32 instructions don't.
+  switch (Opcode) {
+  case AMDGPU::V_PK_ADD_F32:
+    return AMDGPU::V_ADD_F32_e64;
+  case AMDGPU::V_PK_MUL_F32:
+    return AMDGPU::V_MUL_F32_e64;
+  case AMDGPU::V_PK_FMA_F32:
+    return AMDGPU::V_FMA_F32_e64;
+  default:
+    return std::numeric_limits<uint16_t>::max();
+  }
+  llvm_unreachable("Fully covered switch");
+}
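Before reading addOperandAndMods() below, it may help to see its two translation rules in isolation: NEG applies to the low 32-bit lane and NEG_HI to the high lane (both become the plain VOP3 NEG modifier on the corresponding unpacked instruction), while OP_SEL_0/OP_SEL_1 choose which source sub-register feeds each result half. A self-contained sketch; the bit values mirror SISrcMods in SIDefines.h, everything else is illustrative:

// Standalone model of the per-half modifier translation (sketch only).
#include <cassert>

enum : unsigned {
  NEG = 1u << 0,      // negate low lane
  NEG_HI = 1u << 1,   // negate high lane (shares the ABS bit position)
  OP_SEL_0 = 1u << 2, // sub-register select for the low result half
  OP_SEL_1 = 1u << 3  // sub-register select for the high result half
};

struct UnpackedSrc {
  unsigned Mods;   // VOP3 source modifiers; only NEG is ever produced
  unsigned SubReg; // 0 -> sub0 (low half), 1 -> sub1 (high half)
};

UnpackedSrc translate(unsigned PackedMods, bool IsHiBits) {
  UnpackedSrc S{0, 0};
  if (PackedMods & (IsHiBits ? NEG_HI : NEG))
    S.Mods |= NEG;
  S.SubReg = (PackedMods & (IsHiBits ? OP_SEL_1 : OP_SEL_0)) ? 1 : 0;
  return S;
}

int main() {
  // e.g. a packed source with neg_hi set and op_sel selecting the high half
  // for the low lane: the low unpacked op reads sub1 unnegated, the high
  // unpacked op reads sub0 negated.
  UnpackedSrc Lo = translate(NEG_HI | OP_SEL_0, /*IsHiBits=*/false);
  UnpackedSrc Hi = translate(NEG_HI | OP_SEL_0, /*IsHiBits=*/true);
  assert(Lo.Mods == 0 && Lo.SubReg == 1);
  assert(Hi.Mods == NEG && Hi.SubReg == 0);
}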
+void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
+                                          unsigned SrcMods, bool IsHiBits,
+                                          const MachineOperand &SrcMO) {
+  unsigned NewSrcMods = 0;
+  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+  // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
+  // for ABS modifiers.
+  // If NEG or NEG_HI is true, we need to negate the corresponding 32-bit
+  // lane.
+  // NEG_HI shares the same bit position with ABS. But packed instructions do
+  // not support ABS. Therefore, NEG_HI must be translated to the NEG source
+  // modifier for the higher 32 bits. Unpacked VOP3 instructions support
+  // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
+  // NEG modifier if present in the packed instruction.
+  if (SrcMods & NegModifier)
+    NewSrcMods |= SISrcMods::NEG;
+  // Src modifiers. Only the NEG modifier is added when needed. Unpacked
+  // operations do not have op_sel, therefore it must be handled explicitly as
+  // done below.
+  NewMI.addImm(NewSrcMods);
+  if (SrcMO.isImm()) {
+    NewMI.addImm(SrcMO.getImm());
+    return;
+  }
+  // If op_sel == 0, select register 0 of reg:sub0_sub1.
+  Register UnpackedSrcReg = (SrcMods & OpSelModifier)
+                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+
+  MachineOperand UnpackedSrcMO =
+      MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
+  if (SrcMO.isKill()) {
+    // For each unpacked instruction, mark its source registers as killed if
+    // the corresponding source register in the original packed instruction
+    // was marked as killed.
+    //
+    // Exception:
+    // If the op_sel and op_sel_hi modifiers require both unpacked instructions
+    // to use the same register (e.g., due to overlapping access to low/high
+    // bits of the same packed register), then only the *second* (latter)
+    // instruction should mark the register as killed. This is because the
+    // second instruction handles the higher bits and is effectively the last
+    // user of the full register pair.
+
+    bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
+    bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
+    bool KillState = true;
+    if ((OpSel == OpSelHi) && !IsHiBits)
+      KillState = false;
+    UnpackedSrcMO.setIsKill(KillState);
+  }
+  NewMI.add(UnpackedSrcMO);
+}
+
+void SIPreEmitPeephole::collectUnpackingCandidates(
+    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+    uint16_t NumMFMACycles) {
+  auto *BB = BeginMI.getParent();
+  auto E = BB->end();
+  int TotalCyclesBetweenCandidates = 0;
+  auto SchedModel = TII->getSchedModel();
+  Register MFMADef = BeginMI.getOperand(0).getReg();
+
+  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+    MachineInstr &Instr = *I;
+    if (Instr.isMetaInstruction())
+      continue;
+    if (Instr.isTerminator() ||
+        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (SIInstrInfo::modifiesModeRegister(Instr) &&
+         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
+      return;
+
+    const MCSchedClassDesc *InstrSchedClassDesc =
+        SchedModel.resolveSchedClass(&Instr);
+    uint16_t Latency =
+        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+    TotalCyclesBetweenCandidates += Latency;
+
+    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
+      return;
+    // Identify register dependencies between those used by the MFMA
+    // instruction and the following packed instructions. Also checks for
+    // transitive dependencies between the MFMA def and candidate instruction
+    // def and uses. Conservatively ensures that we do not incorrectly
+    // read/write registers.
+    for (const MachineOperand &InstrMO : Instr.operands()) {
+      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
+        continue;
+      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
+        return;
+    }
+    if (!isUnpackingSupportedInstr(Instr))
+      continue;
+
+    if (canUnpackingClobberRegister(Instr))
+      return;
+    // If it's a packed instruction, adjust latency: remove the packed
+    // latency, add latency of two unpacked instructions (currently estimated
+    // as 2 cycles).
+    TotalCyclesBetweenCandidates -= Latency;
+    // TODO: improve latency handling based on instruction modeling.
+    TotalCyclesBetweenCandidates += 2;
+    // Subtract 1 to account for MFMA issue latency.
+    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
+      InstrsToUnpack.insert(&Instr);
+  }
+  return;
+}
+
+void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
+  MachineOperand &DstOp = I.getOperand(0);
+
+  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
+         "Unsupported Opcode");
+
+  MachineInstrBuilder Op0LOp1L =
+      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
+  MachineOperand &LoDstOp = Op0LOp1L->getOperand(0);
+
+  LoDstOp.setIsUndef(DstOp.isUndef());
+
+  MachineInstrBuilder Op0HOp1H =
+      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
+  MachineOperand &HiDstOp = Op0HOp1H->getOperand(0);
+
+  if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+    Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+    Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+  }
+  if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
+    Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
+    Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
+  }
+
+  LoDstOp.setIsRenamable(DstOp.isRenamable());
+  HiDstOp.setIsRenamable(DstOp.isRenamable());
+
+  I.eraseFromParent();
+  return;
+}
+
+MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
+                                                        uint16_t UnpackedOpcode,
+                                                        bool IsHiBits) {
+  MachineBasicBlock &MBB = *I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+  const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+  Register DstReg = I.getOperand(0).getReg();
+  unsigned OpCode = I.getOpcode();
+  Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
+                                     : TRI->getSubReg(DstReg, AMDGPU::sub0);
+
+  int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
+  unsigned Src0Mods =
+      TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
+  unsigned Src1Mods =
+      TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
+
+  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+  NewMI.addDef(UnpackedDstReg); // vdst
+  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
+  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+
+  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
+    const MachineOperand *SrcMO3 =
+        TII->getNamedOperand(I, AMDGPU::OpName::src2);
+    unsigned Src2Mods =
+        TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
+    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+  }
+  NewMI.addImm(ClampVal); // clamp
+  // Packed instructions do not support output modifiers, so it is safe to
+  // assign 0 here.
+  NewMI.addImm(0); // omod
+  return NewMI;
+}
+
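Putting the pieces together, the rewrite performed by performF32Unpacking() turns, for example (register assignments illustrative):

  v_pk_add_f32 v[4:5], v[0:1], v[2:3]
    ==>
  v_add_f32_e64 v4, v0, v2
  v_add_f32_e64 v5, v1, v3

How many candidates get rewritten is purely a cycle budget. A standalone sketch of that bookkeeping, assuming an illustrative 16-cycle MFMA and unit packed latency rather than real scheduling-model values:

#include <cstdint>
#include <iostream>

// Mirrors the arithmetic in collectUnpackingCandidates(): a packed candidate
// is accepted only if swapping its packed latency for two unpacked ops
// (estimated at 2 cycles total) still fits under NumMFMACycles - 1, where
// the -1 accounts for the MFMA issue cycle.
bool acceptCandidate(int &TotalCycles, uint16_t PackedLatency,
                     uint16_t NumMFMACycles) {
  TotalCycles += PackedLatency;
  if (TotalCycles >= NumMFMACycles - 1)
    return false; // the MFMA shadow is already used up
  TotalCycles += 2 - PackedLatency; // replace packed cost with two unpacked ops
  return TotalCycles < NumMFMACycles - 1;
}

int main() {
  int Total = 0;
  int Accepted = 0;
  while (acceptCandidate(Total, /*PackedLatency=*/1, /*NumMFMACycles=*/16))
    ++Accepted;
  std::cout << Accepted << " candidates fit under the MFMA shadow\n";
}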
 PreservedAnalyses
 llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                  MachineFunctionAnalysisManager &MFAM) {
@@ -483,5 +793,26 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
     }
   }

+  // TODO: Fold this into previous block, if possible. Evaluate and handle any
+  // side effects.
+  for (MachineBasicBlock &MBB : MF) {
+    // Unpack packed instructions overlapped by MFMAs. This allows the
+    // compiler to co-issue unpacked instructions with MFMAs.
+    auto SchedModel = TII->getSchedModel();
+    SetVector<MachineInstr *> InstrsToUnpack;
+    for (auto &MI : make_early_inc_range(MBB.instrs())) {
+      if (!SIInstrInfo::isMFMA(MI))
+        continue;
+      const MCSchedClassDesc *SchedClassDesc =
+          SchedModel.resolveSchedClass(&MI);
+      uint16_t NumMFMACycles =
+          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+    }
+    for (MachineInstr *MI : InstrsToUnpack) {
+      performF32Unpacking(*MI);
+    }
+  }
+
   return Changed;
 }
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 2e7f25b67fb63..b92b741b70c79 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -224,6 +224,12 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
                                                   fshr, null_frag>;
 defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
+
+// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32.
+// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored.
+defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
+defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
+
 let True16Predicate = UseRealTrue16Insts in
 defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16>;
 let True16Predicate = UseFakeTrue16Insts in
@@ -265,6 +271,16 @@ let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
   } // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 } // End isReMaterializable = 1

+let SubtargetPredicate = isGFX9GFX10 in
+def : GCNPat <
+(i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)),
+                           (i32 (VOP3OpSelMods i32:$src1, i32:$src1_modifiers)),
+                           (i32 (VOP3OpSelMods i32:$src2, i32:$src2_modifiers)))),
+(V_ALIGNBYTE_B32_opsel_e64 i32:$src0_modifiers, VSrc_b32:$src0,
+                           i32:$src1_modifiers, VSrc_b32:$src1,
+                           i32:$src2_modifiers, VGPR_32:$src2)
+>;
+
 let True16Predicate = UseFakeTrue16Insts in
 def : GCNPat <
   (i32 (int_amdgcn_alignbyte (i32 (VOP3OpSelMods i32:$src0, i32:$src0_modifiers)),
@@ -956,10 +972,10 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<timm, [{
   unsigned Val = N->getZExtValue();
   unsigned New = 0;
   if (}] # modifier_idx # [{ == 0) {
-    New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
-                                    : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
+    New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
+                                    : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
   } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) {
-    New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+    New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
   }
   return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
 }]>;
@@ -1003,7 +1019,7 @@ def gi_SrcSelToOpSelXForm : GICustomOperandRenderer<"renderSrcSelToOpSelXForm">,
 def DstSelToOpSel3XForm : SDNodeXForm<timm, [{
   unsigned V = N->getZExtValue();
   return CurDAG->getTargetConstant(
-    (V & 0x2) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
+    (V & 0x1) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
     SDLoc(N), MVT::i32);
 }]>;
 def gi_DstSelToOpSel3XForm : GICustomOperandRenderer<"renderDstSelToOpSel3XFormXForm">,
@@ -1954,6 +1970,9 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
   }
 } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"

+defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
+defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
+
 defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;

 let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
@@ -2104,8 +2123,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>;
 defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>;
 defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>;
 defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>;
-defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>;
-defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>;
+defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>;
 defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>;
 defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>;
 defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>;
@@ -2248,6 +2267,17 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
   }
 }

+// Instructions such as v_alignbyte_b32 allow op_sel in GFX9, but not in VI.
+// The following multiclass is created to support that.
+multiclass VOP3OpSel_Real_gfx9_with_name<bits<10> op, string opName, string AsmName> {
+  defvar psName = opName#"_e64";
+  def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI
+              VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> {
+    VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName);
+    let AsmString = AsmName # ps.AsmOperands;
+  }
+}
+
 } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"

 defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
@@ -2267,8 +2297,10 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>;
 defm V_FMA_F32 : VOP3_Real_vi <0x1cb>;
 defm V_FMA_F64 : VOP3_Real_vi <0x1cc>;
 defm V_LERP_U8 : VOP3_Real_vi <0x1cd>;
+let SubtargetPredicate = isGFX8Only in {
 defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>;
 defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>;
+}
 defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>;
 defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>;
 defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>;
@@ -2313,6 +2345,9 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16"
 defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
 defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;

+defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
+defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
+
 defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
 defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
 defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 3f2b12fd41b03..618a2089eb664 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -892,9 +892,14 @@ class MAIInst : MAIInst {
   // Append operands from V_MFMA_LD_SCALE_B32, but we need to rename them.
+  // Restrict the scale operands to VGPRs only (VRegSrc_32) to work around a
+  // hardware design defect: for all inline/SGPR constants, the SP hardware
+  // uses bits [30:23] as the scale.
+  // TODO: We may still be able to allow inline constants/SGPRs, with a proper
+  // shift, to obtain potentially better performance.
   let InOperandList = !con(BaseInst.InOperandList,
-                           (ins VSrc_b32:$scale_src0,
-                                VSrc_b32:$scale_src1,
+                           (ins VRegSrc_32:$scale_src0,
+                                VRegSrc_32:$scale_src1,
                                 op_sel0:$src0_modifiers,
                                 op_sel_hi0:$src1_modifiers));
   let AsmOperands =
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 3beda6bc5ba38..1e9c37d79318f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -11,10 +11,13 @@
 //===----------------------------------------------------------------------===//

 #include "InstCombineInternal.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Analysis/CmpInstAnalysis.h"
 #include "llvm/Analysis/FloatingPointPredicateUtils.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -3592,6 +3595,230 @@ static Value *foldOrOfInversions(BinaryOperator &I,
   return nullptr;
 }

+/// Match \p V as "shufflevector -> bitcast" or "extractelement -> zext -> shl"
+/// patterns, which extract vector elements and pack them in the same relative
+/// positions.
+///
+/// \p Vec is the underlying vector being extracted from.
+/// \p Mask is a bitmask identifying which packed elements are obtained from
+/// the vector.
+/// \p VecOffset is the vector element corresponding to index 0 of the
+/// mask.
+static bool matchSubIntegerPackFromVector(Value *V, Value *&Vec,
+                                          int64_t &VecOffset,
+                                          SmallBitVector &Mask,
+                                          const DataLayout &DL) {
+  static const auto m_ConstShlOrSelf = [](const auto &Base, uint64_t &ShlAmt) {
+    ShlAmt = 0;
+    return m_CombineOr(m_Shl(Base, m_ConstantInt(ShlAmt)), Base);
+  };
+
+  // First try to match extractelement -> zext -> shl
+  uint64_t VecIdx, ShlAmt;
+  if (match(V, m_ConstShlOrSelf(m_ZExtOrSelf(m_ExtractElt(
+                                    m_Value(Vec), m_ConstantInt(VecIdx))),
+                                ShlAmt))) {
+    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+    if (!VecTy)
+      return false;
+    auto *EltTy = dyn_cast<IntegerType>(VecTy->getElementType());
+    if (!EltTy)
+      return false;
+
+    const unsigned EltBitWidth = EltTy->getBitWidth();
+    const unsigned TargetBitWidth = V->getType()->getIntegerBitWidth();
+    if (TargetBitWidth % EltBitWidth != 0 || ShlAmt % EltBitWidth != 0)
+      return false;
+    const unsigned TargetEltWidth = TargetBitWidth / EltBitWidth;
+    const unsigned ShlEltAmt = ShlAmt / EltBitWidth;
+
+    const unsigned MaskIdx =
+        DL.isLittleEndian() ? ShlEltAmt : TargetEltWidth - ShlEltAmt - 1;
+
+    VecOffset = static_cast<int64_t>(VecIdx) - static_cast<int64_t>(MaskIdx);
+    Mask.resize(TargetEltWidth);
+    Mask.set(MaskIdx);
+    return true;
+  }
+
+  // Now try to match a bitcasted subvector.
+  Instruction *SrcVecI;
+  if (!match(V, m_BitCast(m_Instruction(SrcVecI))))
+    return false;
+
+  auto *SrcTy = dyn_cast<FixedVectorType>(SrcVecI->getType());
+  if (!SrcTy)
+    return false;
+
+  Mask.resize(SrcTy->getNumElements());
+
+  // First check for a subvector obtained from a shufflevector.
+  if (isa<ShuffleVectorInst>(SrcVecI)) {
+    Constant *ConstVec;
+    ArrayRef<int> ShuffleMask;
+    if (!match(SrcVecI, m_Shuffle(m_Value(Vec), m_Constant(ConstVec),
+                                  m_Mask(ShuffleMask))))
+      return false;
+
+    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+    if (!VecTy)
+      return false;
+
+    const unsigned NumVecElts = VecTy->getNumElements();
+    bool FoundVecOffset = false;
+    for (unsigned Idx = 0; Idx < ShuffleMask.size(); ++Idx) {
+      if (ShuffleMask[Idx] == PoisonMaskElem)
+        return false;
+      const unsigned ShuffleIdx = ShuffleMask[Idx];
+      if (ShuffleIdx >= NumVecElts) {
+        const unsigned ConstIdx = ShuffleIdx - NumVecElts;
+        auto *ConstElt =
+            dyn_cast<ConstantInt>(ConstVec->getAggregateElement(ConstIdx));
+        if (!ConstElt || !ConstElt->isNullValue())
+          return false;
+        continue;
+      }
+
+      if (FoundVecOffset) {
+        if (VecOffset + Idx != ShuffleIdx)
+          return false;
+      } else {
+        if (ShuffleIdx < Idx)
+          return false;
+        VecOffset = ShuffleIdx - Idx;
+        FoundVecOffset = true;
+      }
+      Mask.set(Idx);
+    }
+    return FoundVecOffset;
+  }
+
+  // Check for a subvector obtained as an (insertelement V, 0, idx)
+  uint64_t InsertIdx;
+  if (!match(SrcVecI,
+             m_InsertElt(m_Value(Vec), m_Zero(), m_ConstantInt(InsertIdx))))
+    return false;
+
+  auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+  if (!VecTy)
+    return false;
+  VecOffset = 0;
+  bool AlreadyInsertedMaskedElt = Mask.test(InsertIdx);
+  Mask.set();
+  if (!AlreadyInsertedMaskedElt)
+    Mask.reset(InsertIdx);
+  return true;
+}
+
+/// Try to fold the join of two scalar integers whose contents are packed
+/// elements of the same vector.
+static Instruction *foldIntegerPackFromVector(Instruction &I,
+                                              InstCombiner::BuilderTy &Builder,
+                                              const DataLayout &DL) {
+  assert(I.getOpcode() == Instruction::Or);
+  Value *LhsVec, *RhsVec;
+  int64_t LhsVecOffset, RhsVecOffset;
+  SmallBitVector Mask;
+  if (!matchSubIntegerPackFromVector(I.getOperand(0), LhsVec, LhsVecOffset,
+                                     Mask, DL))
+    return nullptr;
+  if (!matchSubIntegerPackFromVector(I.getOperand(1), RhsVec, RhsVecOffset,
+                                     Mask, DL))
+    return nullptr;
+  if (LhsVec != RhsVec || LhsVecOffset != RhsVecOffset)
+    return nullptr;
+
+  // Convert into shufflevector -> bitcast.
+  const unsigned ZeroVecIdx =
+      cast<FixedVectorType>(LhsVec->getType())->getNumElements();
+  SmallVector<int> ShuffleMask(Mask.size(), ZeroVecIdx);
+  for (unsigned Idx : Mask.set_bits()) {
+    assert(LhsVecOffset + Idx >= 0);
+    ShuffleMask[Idx] = LhsVecOffset + Idx;
+  }
+
+  Value *MaskedVec = Builder.CreateShuffleVector(
+      LhsVec, Constant::getNullValue(LhsVec->getType()), ShuffleMask,
+      I.getName() + ".v");
+  return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType());
+}
+
+/// Match \p V as "lshr -> mask -> zext -> shl".
+///
+/// \p Int is the underlying integer being extracted from.
+/// \p Mask is a bitmask identifying which bits of the integer are being
+/// extracted.
+/// \p Offset identifies which bit of the result \p V corresponds to the
+/// least significant bit of \p Int.
+static bool matchZExtedSubInteger(Value *V, Value *&Int, APInt &Mask,
+                                  uint64_t &Offset, bool &IsShlNUW,
+                                  bool &IsShlNSW) {
+  Value *ShlOp0;
+  uint64_t ShlAmt = 0;
+  if (!match(V, m_OneUse(m_Shl(m_Value(ShlOp0), m_ConstantInt(ShlAmt)))))
+    return false;
+
+  IsShlNUW = cast<Instruction>(V)->hasNoUnsignedWrap();
+  IsShlNSW = cast<Instruction>(V)->hasNoSignedWrap();
+
+  Value *ZExtOp0;
+  if (!match(ShlOp0, m_OneUse(m_ZExt(m_Value(ZExtOp0)))))
+    return false;
+
+  Value *MaskedOp0;
+  const APInt *ShiftedMaskConst = nullptr;
+  if (!match(ZExtOp0, m_CombineOr(m_OneUse(m_And(m_Value(MaskedOp0),
+                                                 m_APInt(ShiftedMaskConst))),
+                                  m_Value(MaskedOp0))))
+    return false;
+
+  uint64_t LShrAmt = 0;
+  if (!match(MaskedOp0,
+             m_CombineOr(m_OneUse(m_LShr(m_Value(Int), m_ConstantInt(LShrAmt))),
+                         m_Value(Int))))
+    return false;
+
+  if (LShrAmt > ShlAmt)
+    return false;
+  Offset = ShlAmt - LShrAmt;
+
+  Mask = ShiftedMaskConst ? ShiftedMaskConst->shl(LShrAmt)
+                          : APInt::getBitsSetFrom(
+                                Int->getType()->getScalarSizeInBits(), LShrAmt);
+
+  return true;
+}
+
+/// Try to fold the join of two scalar integers whose bits are unpacked and
+/// zexted from the same source integer.
+static Value *foldIntegerRepackThroughZExt(Value *Lhs, Value *Rhs,
+                                           InstCombiner::BuilderTy &Builder) {
+
+  Value *LhsInt, *RhsInt;
+  APInt LhsMask, RhsMask;
+  uint64_t LhsOffset, RhsOffset;
+  bool IsLhsShlNUW, IsLhsShlNSW, IsRhsShlNUW, IsRhsShlNSW;
+  if (!matchZExtedSubInteger(Lhs, LhsInt, LhsMask, LhsOffset, IsLhsShlNUW,
+                             IsLhsShlNSW))
+    return nullptr;
+  if (!matchZExtedSubInteger(Rhs, RhsInt, RhsMask, RhsOffset, IsRhsShlNUW,
+                             IsRhsShlNSW))
+    return nullptr;
+  if (LhsInt != RhsInt || LhsOffset != RhsOffset)
+    return nullptr;
+
+  APInt Mask = LhsMask | RhsMask;
+
+  Type *DestTy = Lhs->getType();
+  Value *Res = Builder.CreateShl(
+      Builder.CreateZExt(
+          Builder.CreateAnd(LhsInt, Mask, LhsInt->getName() + ".mask"), DestTy,
+          LhsInt->getName() + ".zext"),
+      ConstantInt::get(DestTy, LhsOffset), "", IsLhsShlNUW && IsRhsShlNUW,
+      IsLhsShlNSW && IsRhsShlNSW);
+  Res->takeName(Lhs);
+  return Res;
+}
+
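The net effect of foldIntegerRepackThroughZExt() is easiest to see on concrete masks. A standalone check of the equivalence, with illustrative constants (lshr amounts 0 and 8, shl amounts 8 and 16, 8-bit masks):

#include <cassert>
#include <cstdint>

// Two zext/shl extractions from the same integer at the same relative
// offset (shl amount minus lshr amount) fuse into one mask+zext+shl.
uint64_t before(uint32_t X) {
  uint64_t Lo = (uint64_t)(X & 0xFFu) << 8;          // offset 8, mask 0x00FF
  uint64_t Hi = (uint64_t)((X >> 8) & 0xFFu) << 16;  // offset 8, mask 0xFF00
  return Lo | Hi;
}

uint64_t after(uint32_t X) {
  // Masks are re-aligned to the source integer and OR'ed; common offset = 8.
  return (uint64_t)(X & 0xFFFFu) << 8;
}

int main() {
  for (uint32_t X : {0u, 0x12345678u, 0xFFFFFFFFu})
    assert(before(X) == after(X));
}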
 // A decomposition of ((X & Mask) * Factor). The NUW / NSW bools
 // track these properties for preservation. Note that we can decompose
 // equivalent select form of this expression (e.g. (!(X & Mask) ? 0 : Mask *
@@ -3693,6 +3920,8 @@ static Value *foldBitmaskMul(Value *Op0, Value *Op1,
 Value *InstCombinerImpl::foldDisjointOr(Value *LHS, Value *RHS) {
   if (Value *Res = foldBitmaskMul(LHS, RHS, Builder))
     return Res;
+  if (Value *Res = foldIntegerRepackThroughZExt(LHS, RHS, Builder))
+    return Res;

   return nullptr;
 }
@@ -3770,6 +3999,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Instruction *X = foldComplexAndOrPatterns(I, Builder))
     return X;

+  if (Instruction *X = foldIntegerPackFromVector(I, Builder, DL))
+    return X;
+
   // (A & B) | (C & D) -> A ^ D where A == ~C && B == ~D
   // (A & B) | (C & D) -> A ^ C where A == ~D && B == ~C
   if (Value *V = foldOrOfInversions(I, Builder))
@@ -3823,7 +4055,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
                                          /*NSW=*/true, /*NUW=*/true))
     return R;

-  if (Value *Res = foldBitmaskMul(I.getOperand(0), I.getOperand(1), Builder))
+  if (Value *Res = foldDisjointOr(I.getOperand(0), I.getOperand(1)))
     return replaceInstUsesWith(I, Res);

   if (Value *Res = reassociateDisjointOr(I.getOperand(0), I.getOperand(1)))
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index becda960a16f0..e6d537f4678ee 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -83,6 +83,7 @@
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
@@ -5007,6 +5008,34 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     // FIXME: We might want to defer PHI speculation until after here.
     // FIXME: return nullptr;
   } else {
+    // AMDGPU: If the target is AMDGPU and the chosen SliceTy is a HIP vector
+    // struct of 2 or 4 identical elements, canonicalize it to an IR vector.
+    // This helps SROA treat it as a single value and unlocks vector
+    // loads/stores. We pattern-match struct names starting with
+    // "struct.HIP_vector".
+    if (Function *F = AI.getFunction()) {
+      Triple TT(F->getParent()->getTargetTriple());
+      if (TT.isAMDGPU()) {
+        if (auto *STy = dyn_cast<StructType>(SliceTy)) {
+          StringRef Name = STy->hasName() ? STy->getName() : StringRef();
+          if (Name.starts_with("struct.HIP_vector")) {
+            unsigned NumElts = STy->getNumElements();
+            if (NumElts == 2 || NumElts == 4) {
+              Type *EltTy = STy->getElementType(0);
+              bool AllSame = true;
+              for (unsigned I = 1; I < NumElts; ++I)
+                if (STy->getElementType(I) != EltTy) {
+                  AllSame = false;
+                  break;
+                }
+              if (AllSame && VectorType::isValidElementType(EltTy)) {
+                SliceTy = FixedVectorType::get(EltTy, NumElts);
+              }
+            }
+          }
+        }
+      }
+    }
+
     // Make sure the alignment is compatible with P.beginOffset().
     const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
     // If we will get at least this much alignment from the type alone, leave
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 878e2859c4a2f..1a43b959ee90d 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -502,10 +502,18 @@ if(build_runtimes)
   endif()

   # Forward user-provided system configuration to runtimes for requirement introspection.
-  # CMAKE_PREFIX_PATH is the search path for CMake packages.
+  # CMAKE_PREFIX_PATH is the search path for CMake packages.
+  # In order to pass through the command line interface, the CMake semicolon
+  # separator needs to be replaced with $<SEMICOLON>
   if(CMAKE_PREFIX_PATH)
-    list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}")
+    string(JOIN "$<SEMICOLON>" escaped_cmake_prefix_path ${CMAKE_PREFIX_PATH})
+    # Some projects require access to the LLVM lib/cmake directory
+    if (OFFLOAD_EXTERNAL_PROJECT_UNIFIED_ROCR OR DEFINED LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH)
+      string(PREPEND escaped_cmake_prefix_path "${CMAKE_BINARY_DIR}/lib/cmake$<SEMICOLON>")
+    endif()
+    list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${escaped_cmake_prefix_path}")
   endif()
+
   # CMAKE_PROGRAM_PATH is the search path for executables such as python.
   if(CMAKE_PROGRAM_PATH)
     list(APPEND extra_cmake_args "-DCMAKE_PROGRAM_PATH=${CMAKE_PROGRAM_PATH}")
@@ -535,7 +543,8 @@ if(build_runtimes)
           CMAKE_ARGS -DBUILD_SHARED_LIBS=ON
                      -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake
                      -DIMAGE_SUPPORT=OFF
-                     -DLLVM_RUNTIME_OPENMP=ON)
+                     -DLLVM_RUNTIME_OPENMP=ON
+                     ${extra_cmake_args})
     set(HSA_DEP rocr-runtime)
   endif()

@@ -551,7 +560,8 @@ if(build_runtimes)
         CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCM_DEVICE_LIBS_INSTALL_PREFIX_PATH}
                    -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake
                    -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_NEW=${ROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC}
-                   -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn)
+                   -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn
+                   ${extra_cmake_args})
     else()
       ExternalProject_Add(rocm-device-libs
         SOURCE_DIR ${LIBOMPTARGET_EXTERNAL_PROJECT_ROCM_DEVICE_LIBS_PATH}
@@ -559,7 +569,8 @@
         INSTALL_COMMAND ""
         CMAKE_ARGS -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake
                    -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_NEW=${ROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC}
-                   -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn)
+                   -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn
+                   ${extra_cmake_args})
     endif()
   endif()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index aeb301939e986..dac726df5decb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck
-enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s ; FIXME: Merge with other test. DS offset folding doesn't work due to ; register bank copies, and no return optimization is missing. @@ -91,7 +91,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr addrspace(1) %out, align 4 ret void } @@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr addrspace(1) %out, align 4 ret void } @@ -233,7 +233,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -291,7 +291,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -368,7 +368,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr addrspace(1) %out, align 4 ret void } @@ -451,7 +451,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr addrspace(1) %out, align 4 ret void } @@ -534,7 +534,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4, 
!amdgpu.no.remote.memory !1 store i32 %result, ptr addrspace(1) %out, align 4 ret void } @@ -603,7 +603,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -676,7 +676,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -749,7 +749,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -846,7 +846,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr addrspace(1) %out.gep, align 4 ret void } @@ -930,7 +930,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr %out, align 4 ret void } @@ -1119,8 +1119,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm - %gep = getelementptr i32, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 + %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr %out, align 4 ret void } @@ -1218,8 +1218,8 @@ define amdgpu_kernel void 
@flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm - %gep = getelementptr i32, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 + %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr %out, align 4 ret void } @@ -1299,7 +1299,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -1384,8 +1384,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %gep = getelementptr i32, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 + %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -1470,8 +1470,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %gep = getelementptr i32, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 + %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -1599,8 +1599,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %out.gep = getelementptr i32, ptr %out, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 + %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %result, ptr %out.gep, align 4 ret void } @@ -1706,8 +1706,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 + %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 ret void } @@ -1813,7 +1813,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 + %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1 store i64 %result, ptr %out, align 4 ret void } @@ -1926,8 +1926,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) 
#1 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm - %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1 store i64 %result, ptr %out, align 4 ret void } @@ -2012,7 +2012,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 + %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1 ret void } @@ -2102,8 +2102,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1 ret void } @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 + %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1 ret void } @@ -2333,8 +2333,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1 store i64 %result, ptr %out.gep, align 4 ret void } @@ -2444,8 +2444,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1 ret void } @@ -2540,7 +2540,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 - %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 
syncscope("agent") seq_cst, align 4 + %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1 store i32 %idx.0, ptr addrspace(1) %add_use, align 4 store i32 %result, ptr addrspace(1) %out, align 4 ret void @@ -2629,7 +2629,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1 store i64 %result, ptr addrspace(1) %out, align 4 ret void } @@ -2781,7 +2781,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1 ret void } @@ -2844,7 +2844,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1 ret void } @@ -2926,7 +2926,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1 store i64 %result, ptr addrspace(1) %out, align 4 ret void } @@ -3014,7 +3014,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1 store i64 %result, ptr addrspace(1) %out, align 4 ret void } @@ -3102,7 +3102,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 - %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8, !amdgpu.no.remote.memory !1 store i64 %result, ptr addrspace(1) %out, align 4 ret void } @@ -3176,7 +3176,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1 ret void } 
@@ -3254,7 +3254,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
 %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -3332,7 +3332,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
 %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -3434,7 +3434,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
 %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 store i64 %result, ptr addrspace(1) %out.gep, align 4
 ret void
 }
@@ -3523,7 +3523,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 %id = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
 %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -3624,7 +3624,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
 %idx.0 = add nsw i32 %tid.x, 2
 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
- %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 store i32 %idx.0, ptr addrspace(1) %add_use, align 4
 store i64 %result, ptr addrspace(1) %out, align 4
 ret void
@@ -3635,6 +3635,7 @@ attributes #1 = { nounwind }
 attributes #2 = { nounwind memory(none) }
 !0 = !{i32 5, i32 6}
+!1 = !{}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 788a4e6fb2141..77d212afa0594 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 ; FIXME: Merge with other test. DS offset folding doesn't work due to
 ; register bank copies, and no return optimization is missing.
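The hunks that follow all apply one uniform change: each atomicrmw gains !amdgpu.no.remote.memory metadata, declared as the empty node !1 = !{} in each test file's footer. A minimal self-contained sketch of the annotated form, with an illustrative kernel name (!0 is the !noalias.addrspace range the tests already use):

    define amdgpu_kernel void @sketch(ptr %p, ptr addrspace(1) %out) {
      %r = atomicrmw uinc_wrap ptr %p, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
      store i64 %r, ptr addrspace(1) %out, align 4
      ret void
    }

    !0 = !{i32 5, i32 6} ; flat pointer does not alias addrspace(5), as in the tests
    !1 = !{}             ; empty node: asserts the access does not touch remote memory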
@@ -424,7 +424,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 store i32 %result, ptr addrspace(1) %out, align 4
 ret void
 }
@@ -518,7 +518,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT: s_endpgm
 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 store i32 %result, ptr addrspace(1) %out, align 4
 ret void
 }
@@ -614,7 +614,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT: s_endpgm
 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
 store i32 %result, ptr addrspace(1) %out, align 4
 ret void
 }
@@ -693,7 +693,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -776,7 +776,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_endpgm
 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -861,7 +861,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
 ; GFX12-NEXT: s_endpgm
 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -971,7 +971,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
 %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 store i32 %result, ptr addrspace(1) %out.gep, align 4
 ret void
 }
@@ -1067,7 +1067,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 %id = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
 %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -1625,7 +1625,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 store i64 %result, ptr addrspace(1) %out, align 4
 ret void
 }
@@ -1725,7 +1725,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT: s_endpgm
 %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 store i64 %result, ptr addrspace(1) %out, align 4
 ret void
 }
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT: s_endpgm
 %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8, !amdgpu.no.remote.memory !1
 store i64 %result, ptr addrspace(1) %out, align 4
 ret void
 }
@@ -1912,7 +1912,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -2001,7 +2001,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_endpgm
 %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -2092,7 +2092,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
 ; GFX12-NEXT: s_endpgm
 %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -2208,7 +2208,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
 %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 store i64 %result, ptr addrspace(1) %out.gep, align 4
 ret void
 }
@@ -2310,7 +2310,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 %id = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
 %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -2414,7 +2414,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: flat_store_b32 v[0:1], v2
 ; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 store i32 %result, ptr %out, align 4
 ret void
 }
@@ -2525,8 +2525,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: flat_store_b32 v[0:1], v2
 ; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 store i32 %result, ptr %out, align 4
 ret void
 }
@@ -2639,8 +2639,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT: flat_store_b32 v[0:1], v2
 ; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
 store i32 %result, ptr %out, align 4
 ret void
 }
@@ -2731,7 +2731,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -2827,8 +2827,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -2926,8 +2926,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
 ; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -3077,8 +3077,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 %id = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.tid = getelementptr i32, ptr %ptr, i32 %id
 %out.gep = getelementptr i32, ptr %out, i32 %id
- %gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 store i32 %result, ptr %out.gep, align 4
 ret void
 }
@@ -3201,8 +3201,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX12-NEXT: s_endpgm
 %id = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.tid = getelementptr i32, ptr %ptr, i32 %id
- %gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -3444,7 +3444,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 store i64 %result, ptr %out, align 4
 ret void
 }
@@ -3571,8 +3571,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 store i64 %result, ptr %out, align 4
 ret void
 }
@@ -3701,8 +3701,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 store i64 %result, ptr %out, align 4
 ret void
 }
@@ -3799,7 +3799,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -3901,8 +3901,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -4006,8 +4006,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
 ; GFX12-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -4169,8 +4169,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 %id = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.tid = getelementptr i64, ptr %ptr, i32 %id
 %out.gep = getelementptr i64, ptr %out, i32 %id
- %gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 store i64 %result, ptr %out.gep, align 4
 ret void
 }
@@ -4297,8 +4297,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX12-NEXT: s_endpgm
 %id = call i32 @llvm.amdgcn.workitem.id.x()
 %gep.tid = getelementptr i64, ptr %ptr, i32 %id
- %gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
 ret void
 }
@@ -4434,6 +4434,7 @@ attributes #1 = { nounwind }
 attributes #2 = { nounwind memory(none) }
 !0 = !{i32 5, i32 6}
+!1 = !{}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 8a80afd4a768f..bbf563f5bc543 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1933,8 +1933,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9-NEXT: s_mov_b32 s0, 0
 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s0, 4
 ; GFX9-NEXT: scratch_store_dword off, v0, s0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1949,7 +1950,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
 ; GFX10-NEXT: v_mov_b32_e32 v0, 13
 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; GFX10-NEXT: s_add_i32 s0, s0, 4
 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1961,10 +1963,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX942-LABEL: store_load_large_imm_offset_kernel:
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
 ; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s0, 4
 ; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1974,7 +1977,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX11-LABEL: store_load_large_imm_offset_kernel:
 ; GFX11: ; %bb.0: ; %bb
 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, 4
 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2002,8 +2007,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0
 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 4
 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2018,7 +2024,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 4
 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, off offset:4
 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2030,10 +2037,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel:
 ; UNALIGNED_GFX942: ; %bb.0: ; %bb
 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4
 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2043,7 +2051,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel:
 ; UNALIGNED_GFX11: ; %bb.0: ; %bb
 ; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 4
 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
 ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2077,11 +2087,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-LABEL: store_load_large_imm_offset_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s1, 4
 ; GFX9-NEXT: scratch_store_dword off, v0, s0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2092,8 +2104,10 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10: ; %bb.0: ; %bb
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2105,11 +2119,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX942-LABEL: store_load_large_imm_offset_foo:
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
 ; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_add_i32 s1, s32, s0
 ; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s1, 4
 ; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2120,7 +2136,10 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11: ; %bb.0: ; %bb
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s32, s0
+; GFX11-NEXT: s_add_i32 s0, s1, 4
 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2149,11 +2168,13 @@ define void @store_load_large_imm_offset_foo() {
 ; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo:
 ; UNALIGNED_GFX9: ; %bb.0: ; %bb
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0
 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s1, 4
 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2164,8 +2185,10 @@ define void @store_load_large_imm_offset_foo() {
 ; UNALIGNED_GFX10: ; %bb.0: ; %bb
 ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4
 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2177,11 +2200,13 @@ define void @store_load_large_imm_offset_foo() {
 ; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo:
 ; UNALIGNED_GFX942: ; %bb.0: ; %bb
 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0
 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4
 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2192,7 +2217,10 @@ define void @store_load_large_imm_offset_foo() {
 ; UNALIGNED_GFX11: ; %bb.0: ; %bb
 ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s1, 4
 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
 ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
index 5b8c2840b0156..dde566d9643d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: bswap_i32_vv
@@ -19,6 +21,7 @@ body: |
 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935
 ; GFX7-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_e64_1]], [[V_ALIGNBIT_B32_e64_]], implicit $exec
 ; GFX7-NEXT: S_ENDPGM 0, implicit [[V_BFI_B32_e64_]]
+ ;
 ; GFX8-LABEL: name: bswap_i32_vv
 ; GFX8: liveins: $vgpr0
 ; GFX8-NEXT: {{ $}}
@@ -26,6 +29,22 @@ body: |
 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
 ; GFX8-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
 ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
+ ;
+ ; GFX9-LABEL: name: bswap_i32_vv
+ ; GFX9: liveins: $vgpr0
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
+ ; GFX9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
+ ;
+ ; GFX10-LABEL: name: bswap_i32_vv
+ ; GFX10: liveins: $vgpr0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
+ ; GFX10-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = G_BSWAP %0
 S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
index 0a4cb3ccf2957..fa95f33909b76 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
 
 ---
@@ -24,6 +24,24 @@ body: |
 ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
 ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]]
 ;
+ ; GFX9-LABEL: name: fshr_s32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX9-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]]
+ ;
+ ; GFX10-LABEL: name: fshr_s32
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]]
+ ;
 ; GFX11-LABEL: name: fshr_s32
 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
 ; GFX11-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 726bfbab7ad48..45c06cae585cc 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -203,7 +203,7 @@ define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(ptr addrspace(1)
 bb:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tid
- %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i32 1 syncscope("agent") seq_cst
+ %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i32 1 syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
 %tmp0 = insertelement <4 x i32> poison, i32 %in.1, i32 0
 %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
 %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
@@ -229,7 +229,7 @@ define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(ptr addrspace(1
 bb:
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
 %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tid
- %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 1 syncscope("agent") seq_cst
+ %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 1 syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
 %tmp0 = insertelement <2 x i64> poison, i64 %in.1, i32 0
 %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
 %tmp2 = bitcast <2 x i64> %tmp1 to <4 x i32>
@@ -319,3 +319,5 @@ exit:
 }
 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 28f55511ebb6f..42c7b90da63d3 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -169,6 +169,6 @@ attributes #1 = { nounwind }
 ;.
 ; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll
new file mode 100644
index 0000000000000..974cd36b88e87
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes='amdgpu-attributor' %s -o - | FileCheck %s
+
+@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+
+;.
+; CHECK: @lds_1 = internal addrspace(3) global [1 x i8] poison, align 4
+;.
+define amdgpu_kernel void @k0() #0 {
+; CHECK: Function Attrs: sanitize_address
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: store i8 7, ptr addrspace(3) @lds_1, align 4
+; CHECK-NEXT: ret void
+;
+ store i8 7, ptr addrspace(3) @lds_1, align 4
+ ret void
+}
+
+attributes #0 = { sanitize_address }
+; "amdgpu-no-flat-scratch-init" attribute should not be present in attribute list
+;.
+; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 66b6910ff6db5..58987c8e50948 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -105,7 +105,7 @@ declare void @unknown()
 define amdgpu_kernel void @kernel_calls_extern() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT: call void @unknown()
 ; CHECK-NEXT: ret void
 ;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
 define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR3]] {
-; CHECK-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT: ret void
 ;
 call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
 define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT: call void [[INDIRECT]]()
 ; CHECK-NEXT: ret void
 ;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
 define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR7]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]]
 ; CHECK-NEXT: ret void
 ;
 call void %indirect() #0
@@ -242,12 +242,11 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 attributes #0 = { "amdgpu-agpr-alloc"="0" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
 ;.
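The renumbering in the hunks above (ATTR3 to ATTR2, ATTR7 to ATTR6) works because update_test_checks emits FileCheck variable captures: the first reference binds the attribute-group number with a regex, and every later bare reference must match the same number, so removing one expected group shifts each subsequent index down by one. A minimal sketch of the define-then-use pattern (the function name and attribute contents here are illustrative):

    ; CHECK-LABEL: define void @f(
    ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {     <- binds ATTR0 to whatever number appears
    ; CHECK: attributes #[[ATTR0]] = { nounwind }   <- must see the same number again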
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll b/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
index 28722021e0448..7e0208cd1f45a 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
@@ -117,14 +117,14 @@ define void @call_no_dispatch_id() {
 ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
 ;.
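This hunk and the surrounding attributor tests drop "amdgpu-waves-per-eu" from the expected attribute sets; the attribute itself is unchanged and can still be attached explicitly in IR. A minimal sketch, with an illustrative kernel name:

    define amdgpu_kernel void @tuned_kernel() #0 {
      ret void
    }

    attributes #0 = { "amdgpu-waves-per-eu"="4,10" } ; request 4-10 waves per execution unit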
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index cdb9801e660bc..f8fb18ba189a5 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -442,7 +442,7 @@ define internal void @defined.func() #3 {
 define void @func_call_external() #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT: call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -462,7 +462,7 @@ define void @func_call_defined() #3 {
 define void @func_call_asm() #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
-; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR25:[0-9]+]]
+; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR24:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
 call void asm sideeffect "", ""() #3
@@ -471,7 +471,7 @@ define void @func_call_asm() #3 {
 define amdgpu_kernel void @kern_call_external() #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
 ; ATTRIBUTOR_HSA-NEXT: call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -515,7 +515,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
 define float @func_indirect_call(ptr %fptr) #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] {
 ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]()
 ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]]
@@ -528,7 +528,7 @@ define float @func_indirect_call(ptr %fptr) #3 {
 declare float @extern() #3
 define float @func_extern_call() #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
 ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern()
 ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]]
@@ -540,7 +540,7 @@ define float @func_extern_call() #3 {
 define float @func_null_call(ptr %fptr) #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] {
 ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null()
 ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]]
@@ -568,7 +568,7 @@ define float @func_other_intrinsic_call(float %arg) #3 {
 ; Hostcall needs to be enabled for sanitizers
 define amdgpu_kernel void @kern_sanitize_address() #4 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -579,7 +579,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
 ; Hostcall needs to be enabled for sanitizers
 define void @func_sanitize_address() #4 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
 ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -590,7 +590,7 @@ define void @func_sanitize_address() #4 {
 ; Hostcall needs to be enabled for sanitizers
 define void @func_indirect_sanitize_address() #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -601,7 +601,7 @@ define void @func_indirect_sanitize_address() #3 {
 ; Hostcall needs to be enabled for sanitizers
 define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
 ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -615,7 +615,7 @@ declare void @extern_func_sanitize_address() #5
 define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
 ; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -627,7 +627,7 @@ declare void @enqueue_block_decl() #6
 define internal void @enqueue_block_def() #6 {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
 ret void
@@ -635,7 +635,7 @@ define internal void @enqueue_block_def() #6 {
 define amdgpu_kernel void @kern_call_enqueued_block_decl() {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl()
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -645,7 +645,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
 define amdgpu_kernel void @kern_call_enqueued_block_def() {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def()
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
@@ -655,7 +655,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() {
 define void @unused_enqueue_block() {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] {
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
 ret void
@@ -663,7 +663,7 @@ define void @unused_enqueue_block() {
 define internal void @known_func() {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] {
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
 ret void
@@ -672,8 +672,8 @@ define internal void @known_func() {
 ; Should never happen
 define amdgpu_kernel void @kern_callsite_enqueue_block() {
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] {
-; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR26:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] {
+; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR25:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT: ret void
 ;
 call void @known_func() #6
@@ -691,30 +691,29 @@ attributes #6 = { "enqueued-block" }
 ;.
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15:[0-9]+]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr"
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind } -; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "enqueued-block" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; 
ATTRIBUTOR_HSA: attributes #[[ATTR19:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind } +; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "enqueued-block" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 894ef4fa7c976..26c04a35edf16 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -303,7 +303,7 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr ; HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast ; HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { ; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0:![0-9]+]] ; HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(3) %ptr to ptr @@ -315,7 +315,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast ; HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { ; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META1:![0-9]+]] ; HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(5) %ptr to ptr @@ -352,7 +352,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt ; HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast ; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { ; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr -; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META2:![0-9]+]] ; HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(1) %ptr to ptr @@ -364,7 +364,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) % ; 
HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast ; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { ; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr -; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4 +; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4, !noalias.addrspace [[META3:![0-9]+]] ; HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(4) %ptr to ptr @@ -474,17 +474,22 @@ attributes #1 = { nounwind } ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. ; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +;. +; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 10} +; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 10} +; HSA: [[META2]] = !{i32 2, i32 10} +; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 10} ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index aeca1384f09a2..81ccf16c4e4bc 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -294,13 +294,13 @@ attributes #1 = { nounwind } ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" 
"uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 3ca7db155b385..8e7c1cfa1f5a0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,30 +1,30 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_DPP,GFX1164_DPP-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_DPP,GFX1164_DPP-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_DPP,GFX1132_DPP-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_DPP,GFX1132_DPP-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_DPP,GFX1264_DPP-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_DPP,GFX1264_DPP-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_DPP,GFX1164_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_DPP,GFX1164_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_DPP,GFX1132_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_DPP,GFX1132_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_DPP,GFX1264_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_DPP,GFX1264_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -328,7 +328,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
 ; GFX1232-NEXT: s_endpgm
 entry:
-  %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -655,7 +655,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
 ; GFX1232-NEXT: s_endpgm
 entry:
-  %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -1565,7 +1565,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT: s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -1899,7 +1899,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX1232-NEXT: s_endpgm
 entry:
-  %old = atomicrmw add ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -2284,7 +2284,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX1232-NEXT: s_endpgm
 entry:
-  %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -3545,7 +3545,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %zext = zext i32 %lane to i64
-  %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -3861,7 +3861,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
 ; GFX1232-NEXT: s_endpgm
 entry:
-  %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -4190,7 +4190,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
 ; GFX1232-NEXT: s_endpgm
 entry:
-  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -5100,7 +5100,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT: s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -5454,7 +5454,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX1232-NEXT: s_endpgm
 entry:
-  %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -5850,7 +5850,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(
 ; GFX1232-NEXT: buffer_store_b64
v[0:1], off, s[0:3], null ; GFX1232-NEXT: s_endpgm entry: - %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel + %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0 store i64 %old, ptr addrspace(1) %out ret void } @@ -7111,3025 +7111,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 - %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel + %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0 store i64 %old, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i8 %val) { -; GFX7LESS-LABEL: uniform_or_i8: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_and_b32 s8, s2, -4 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_and_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_lshl_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_and_b32 s7, s6, 0xff -; GFX7LESS-NEXT: s_lshl_b32 s7, s7, s2 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s7 -; GFX7LESS-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX7LESS-NEXT: .LBB12_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: uniform_or_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB12_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s8, s2, -4 -; GFX8-NEXT: s_and_b32 s2, s2, 3 -; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff -; GFX8-NEXT: s_lshl_b32 s3, s3, s2 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX8-NEXT: .LBB12_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; 
GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: uniform_or_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s2, -4 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s3, s3, s2 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX9-NEXT: .LBB12_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: uniform_or_i8: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_and_b32 s7, s2, 3 -; GFX1064-NEXT: s_and_b32 s8, s6, 0xff -; GFX1064-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1064-NEXT: s_and_b32 s8, s2, -4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s9 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1064-NEXT: .LBB12_2: -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1064-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: uniform_or_i8: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_and_b32 s5, s2, 3 -; GFX1032-NEXT: s_and_b32 s7, s6, 0xff -; GFX1032-NEXT: 
s_lshl_b32 s5, s5, 3 -; GFX1032-NEXT: s_and_b32 s8, s2, -4 -; GFX1032-NEXT: s_lshl_b32 s7, s7, s5 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s7 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s5, v0 -; GFX1032-NEXT: .LBB12_2: -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc_lo -; GFX1032-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-TRUE16-LABEL: uniform_or_i8: -; GFX1164-TRUE16: ; %bb.0: -; GFX1164-TRUE16-NEXT: s_clause 0x1 -; GFX1164-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1164-TRUE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB12_2 -; GFX1164-TRUE16-NEXT: ; %bb.1: -; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1164-TRUE16-NEXT: s_and_b32 s8, s6, 0xff -; GFX1164-TRUE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1164-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-TRUE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1164-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1164-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1164-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc -; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1164-TRUE16-NEXT: .LBB12_2: -; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX1164-TRUE16-NEXT: s_endpgm -; -; GFX1164-FAKE16-LABEL: uniform_or_i8: -; GFX1164-FAKE16: ; %bb.0: -; GFX1164-FAKE16-NEXT: s_clause 0x1 -; GFX1164-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1164-FAKE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB12_2 -; GFX1164-FAKE16-NEXT: ; %bb.1: -; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_and_b32 s7, s2, 3 -; 
GFX1164-FAKE16-NEXT: s_and_b32 s8, s6, 0xff -; GFX1164-FAKE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-FAKE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1164-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1164-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc -; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1164-FAKE16-NEXT: .LBB12_2: -; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX1164-FAKE16-NEXT: s_endpgm -; -; GFX1132-TRUE16-LABEL: uniform_or_i8: -; GFX1132-TRUE16: ; %bb.0: -; GFX1132-TRUE16-NEXT: s_clause 0x1 -; GFX1132-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1132-TRUE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB12_2 -; GFX1132-TRUE16-NEXT: ; %bb.1: -; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1132-TRUE16-NEXT: s_and_b32 s7, s4, 0xff -; GFX1132-TRUE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1132-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1132-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1132-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1132-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc -; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1132-TRUE16-NEXT: .LBB12_2: -; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX1132-TRUE16-NEXT: s_endpgm -; -; GFX1132-FAKE16-LABEL: uniform_or_i8: -; GFX1132-FAKE16: ; %bb.0: -; GFX1132-FAKE16-NEXT: s_clause 0x1 -; GFX1132-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1132-FAKE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; 
GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB12_2 -; GFX1132-FAKE16-NEXT: ; %bb.1: -; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1132-FAKE16-NEXT: s_and_b32 s7, s4, 0xff -; GFX1132-FAKE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1132-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1132-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc -; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1132-FAKE16-NEXT: .LBB12_2: -; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX1132-FAKE16-NEXT: s_endpgm -; -; GFX1264-TRUE16-LABEL: uniform_or_i8: -; GFX1264-TRUE16: ; %bb.0: -; GFX1264-TRUE16-NEXT: s_clause 0x1 -; GFX1264-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1264-TRUE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB12_2 -; GFX1264-TRUE16-NEXT: ; %bb.1: -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1264-TRUE16-NEXT: s_and_b32 s8, s6, 0xff -; GFX1264-TRUE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1264-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1264-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1264-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1264-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1264-TRUE16-NEXT: .LBB12_2: -; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null -; GFX1264-TRUE16-NEXT: s_endpgm -; -; GFX1264-FAKE16-LABEL: uniform_or_i8: -; GFX1264-FAKE16: ; %bb.0: -; GFX1264-FAKE16-NEXT: s_clause 0x1 -; GFX1264-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-FAKE16-NEXT: 
s_load_b32 s6, s[4:5], 0x34 -; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1264-FAKE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB12_2 -; GFX1264-FAKE16-NEXT: ; %bb.1: -; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1264-FAKE16-NEXT: s_and_b32 s8, s6, 0xff -; GFX1264-FAKE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1264-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1264-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1264-FAKE16-NEXT: .LBB12_2: -; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null -; GFX1264-FAKE16-NEXT: s_endpgm -; -; GFX1232-TRUE16-LABEL: uniform_or_i8: -; GFX1232-TRUE16: ; %bb.0: -; GFX1232-TRUE16-NEXT: s_clause 0x1 -; GFX1232-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1232-TRUE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB12_2 -; GFX1232-TRUE16-NEXT: ; %bb.1: -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1232-TRUE16-NEXT: s_and_b32 s7, s4, 0xff -; GFX1232-TRUE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1232-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1232-TRUE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1232-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1232-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1232-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1232-TRUE16-NEXT: .LBB12_2: -; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l 
-; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
-; GFX1232-TRUE16-NEXT: s_endpgm
-;
-; GFX1232-FAKE16-LABEL: uniform_or_i8:
-; GFX1232-FAKE16: ; %bb.0:
-; GFX1232-FAKE16-NEXT: s_clause 0x1
-; GFX1232-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34
-; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1232-FAKE16-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB12_2
-; GFX1232-FAKE16-NEXT: ; %bb.1:
-; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: s_and_b32 s6, s2, 3
-; GFX1232-FAKE16-NEXT: s_and_b32 s7, s4, 0xff
-; GFX1232-FAKE16-NEXT: s_lshl_b32 s6, s6, 3
-; GFX1232-FAKE16-NEXT: s_and_b32 s8, s2, -4
-; GFX1232-FAKE16-NEXT: s_lshl_b32 s7, s7, s6
-; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7
-; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1232-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX1232-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0
-; GFX1232-FAKE16-NEXT: .LBB12_2:
-; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
-; GFX1232-FAKE16-NEXT: s_endpgm
-  %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
-  store i8 %rmw, ptr addrspace(1) %result
-  ret void
-}
-
-define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i8 %val) {
-; GFX7LESS-LABEL: uniform_add_i8:
-; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
-; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7LESS-NEXT: s_load_dword s10, s[4:5], 0xd
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX7LESS-NEXT: ; implicit-def: $vgpr0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB13_4
-; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_and_b32 s4, s2, -4
-; GFX7LESS-NEXT: s_mov_b32 s5, s3
-; GFX7LESS-NEXT: s_and_b32 s11, s2, 3
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], 0
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mul_i32 s6, s10, s6
-; GFX7LESS-NEXT: s_lshl_b32 s11, s11, 3
-; GFX7LESS-NEXT: s_load_dword s15, s[4:5], 0x0
-; GFX7LESS-NEXT: s_lshl_b32 s12, 0xff, s11
-; GFX7LESS-NEXT: s_and_b32 s6, s6, 0xff
-; GFX7LESS-NEXT: s_not_b32 s13, s12
-; GFX7LESS-NEXT: s_lshl_b32 s14, s6, s11
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v1
-; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_and_b32_e32 v2, s13, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
-; GFX7LESS-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s11, v2
-; GFX7LESS-NEXT: .LBB13_4: ; %Flow
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xff
-; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0
-; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX8-LABEL: uniform_add_i8:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dword s10, s[4:5], 0x34
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB13_4
-; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[6:7]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s6, s10, s4
-; GFX8-NEXT: s_and_b32 s4, s2, -4
-; GFX8-NEXT: s_mov_b32 s5, s3
-; GFX8-NEXT: s_load_dword s7, s[4:5], 0x0
-; GFX8-NEXT: s_and_b32 s2, s2, 3
-; GFX8-NEXT: s_lshl_b32 s11, s2, 3
-; GFX8-NEXT: s_lshl_b32 s12, 0xff, s11
-; GFX8-NEXT: s_and_b32 s2, s6, 0xff
-; GFX8-NEXT: s_not_b32 s13, s12
-; GFX8-NEXT: s_lshl_b32 s14, s2, s11
-; GFX8-NEXT: s_mov_b64 s[2:3], 0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v1
-; GFX8-NEXT: v_and_b32_e32 v2, s13, v1
-; GFX8-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_cbranch_execnz .LBB13_2
-; GFX8-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2
-; GFX8-NEXT: .LBB13_4: ; %Flow
-; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u16 v0, s10, v4, v0
-; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: uniform_add_i8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_load_dword s10, s[4:5], 0x34
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB13_4
-; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[6:7]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s6, s10, s4
-; GFX9-NEXT: s_and_b32 s4, s2, -4
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0
-; GFX9-NEXT: s_and_b32 s2, s2, 3
-; GFX9-NEXT: s_lshl_b32 s11, s2, 3
-; GFX9-NEXT: s_lshl_b32 s12, 0xff, s11
-; GFX9-NEXT: s_and_b32 s2, s6, 0xff
-; GFX9-NEXT: s_not_b32 s13, s12
-; GFX9-NEXT: s_lshl_b32 s14, s2, s11
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_add_u32_e32 v0, s14, v1
-; GFX9-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX9-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB13_2
-; GFX9-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2
-; GFX9-NEXT: .LBB13_4: ; %Flow
-; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v4, v0
-; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; GFX9-NEXT: s_endpgm
-;
-; GFX1064-LABEL: uniform_add_i8:
-; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1064-NEXT: s_load_dword s10, s[4:5], 0x34
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB13_4
-; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_and_b32 s4, s2, -4
-; GFX1064-NEXT: s_mov_b32 s5, s3
-; GFX1064-NEXT: s_and_b32 s2, s2, 3
-; GFX1064-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1064-NEXT: s_lshl_b32 s11, s2, 3
-; GFX1064-NEXT: s_mul_i32 s2, s10, s6
-; GFX1064-NEXT: s_lshl_b32 s12, 0xff, s11
-; GFX1064-NEXT: s_and_b32 s2, s2, 0xff
-; GFX1064-NEXT: s_not_b32 s13, s12
-; GFX1064-NEXT: s_lshl_b32 s14, s2, s11
-; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s6, -1
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1
-; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v2
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1064-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2
-; GFX1064-NEXT: .LBB13_4: ; %Flow
-; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: uniform_add_i8:
-; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1032-NEXT: s_load_dword s8, s[4:5], 0x34
-; GFX1032-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032-NEXT: s_mov_b32 s10, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB13_4
-; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_and_b32 s4, s2, -4
-; GFX1032-NEXT: s_mov_b32 s5, s3
-; GFX1032-NEXT: s_and_b32 s2, s2, 3
-; GFX1032-NEXT: s_load_dword s7, s[4:5], 0x0
-; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s6
-; GFX1032-NEXT: s_lshl_b32 s2, s2, 3
-; GFX1032-NEXT: s_mul_i32 s6, s8, s6
-; GFX1032-NEXT: s_lshl_b32 s3, 0xff, s2
-; GFX1032-NEXT: s_and_b32 s6, s6, 0xff
-; GFX1032-NEXT: s_not_b32 s11, s3
-; GFX1032-NEXT: s_lshl_b32 s12, s6, s2
-; GFX1032-NEXT: s_mov_b32 s6, -1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
-; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0
-; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v2
-; GFX1032-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s10
-; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1032-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2
-; GFX1032-NEXT: .LBB13_4: ; %Flow
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-TRUE16-LABEL: uniform_add_i8:
-; GFX1164-TRUE16: ; %bb.0:
-; GFX1164-TRUE16-NEXT: s_clause 0x1
-; GFX1164-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1164-TRUE16-NEXT: s_load_b32 s10, s[4:5], 0x34
-; GFX1164-TRUE16-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-TRUE16-NEXT: s_mov_b64 s[8:9], exec
-; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
-; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
-; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB13_4
-; GFX1164-TRUE16-NEXT: ; %bb.1:
-; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT: s_and_b32 s4, s2, -4
-; GFX1164-TRUE16-NEXT: s_mov_b32 s5, s3
-; GFX1164-TRUE16-NEXT: s_and_b32 s2, s2, 3
-; GFX1164-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
-; GFX1164-TRUE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164-TRUE16-NEXT: s_lshl_b32 s11, s2, 3
-; GFX1164-TRUE16-NEXT: s_mul_i32 s2, s10, s6
-; GFX1164-TRUE16-NEXT: s_lshl_b32 s12, 0xff, s11
-; GFX1164-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX1164-TRUE16-NEXT: s_not_b32 s13, s12
-; GFX1164-TRUE16-NEXT: s_lshl_b32 s14, s2, s11
-; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1164-TRUE16-NEXT: s_mov_b32 s6, -1
-; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3
-; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
-; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1164-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
-; GFX1164-TRUE16-NEXT: .LBB13_4: ; %Flow
-; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
-; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1164-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX1164-TRUE16-NEXT: s_endpgm
-;
-; GFX1164-FAKE16-LABEL: uniform_add_i8:
-; GFX1164-FAKE16: ; %bb.0:
-; GFX1164-FAKE16-NEXT: s_clause 0x1
-; GFX1164-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1164-FAKE16-NEXT: s_load_b32 s10, s[4:5], 0x34
-; GFX1164-FAKE16-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-FAKE16-NEXT: s_mov_b64 s[8:9], exec
-; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
-; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB13_4
-; GFX1164-FAKE16-NEXT: ; %bb.1:
-; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT: s_and_b32 s4, s2, -4
-; GFX1164-FAKE16-NEXT: s_mov_b32 s5, s3
-; GFX1164-FAKE16-NEXT: s_and_b32 s2, s2, 3
-; GFX1164-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
-; GFX1164-FAKE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164-FAKE16-NEXT: s_lshl_b32 s11, s2, 3
-; GFX1164-FAKE16-NEXT: s_mul_i32 s2, s10, s6
-; GFX1164-FAKE16-NEXT: s_lshl_b32 s12, 0xff, s11
-; GFX1164-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX1164-FAKE16-NEXT: s_not_b32 s13, s12
-; GFX1164-FAKE16-NEXT: s_lshl_b32 s14, s2, s11
-; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1164-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3
-; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
-; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1164-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
-; GFX1164-FAKE16-NEXT: .LBB13_4: ; %Flow
-; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2
-; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX1164-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX1164-FAKE16-NEXT: s_endpgm
-;
-; GFX1132-TRUE16-LABEL: uniform_add_i8:
-; GFX1132-TRUE16: ; %bb.0:
-; GFX1132-TRUE16-NEXT: s_clause 0x1
-; GFX1132-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1132-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34
-; GFX1132-TRUE16-NEXT: s_mov_b32 s6, exec_lo
-; GFX1132-TRUE16-NEXT: s_mov_b32 s10, 0
-; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1132-TRUE16-NEXT: s_mov_b32 s9, exec_lo
-; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
-; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB13_4
-; GFX1132-TRUE16-NEXT: ; %bb.1:
-; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT: s_and_b32 s4, s2, -4
-; GFX1132-TRUE16-NEXT: s_mov_b32 s5, s3
-; GFX1132-TRUE16-NEXT: s_and_b32 s2, s2, 3
-; GFX1132-TRUE16-NEXT: s_load_b32 s7, s[4:5], 0x0
-; GFX1132-TRUE16-NEXT: s_bcnt1_i32_b32 s6, s6
-; GFX1132-TRUE16-NEXT: s_lshl_b32 s2, s2, 3
-; GFX1132-TRUE16-NEXT: s_mul_i32 s6, s8, s6
-; GFX1132-TRUE16-NEXT: s_lshl_b32 s3, 0xff, s2
-; GFX1132-TRUE16-NEXT: s_and_b32 s6, s6, 0xff
-; GFX1132-TRUE16-NEXT: s_not_b32 s11, s3
-; GFX1132-TRUE16-NEXT: s_lshl_b32 s12, s6, s2
-; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1
-; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1132-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1132-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
-; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1132-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-TRUE16-NEXT: .LBB13_4: ; %Flow
-; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
-; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1132-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX1132-TRUE16-NEXT: s_endpgm
-;
-; GFX1132-FAKE16-LABEL: uniform_add_i8:
-; GFX1132-FAKE16: ; %bb.0:
-; GFX1132-FAKE16-NEXT: s_clause 0x1
-; GFX1132-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1132-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34
-; GFX1132-FAKE16-NEXT: s_mov_b32 s6, exec_lo
-; GFX1132-FAKE16-NEXT: s_mov_b32 s10, 0
-; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1132-FAKE16-NEXT: s_mov_b32 s9, exec_lo
-; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
-; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB13_4
-; GFX1132-FAKE16-NEXT: ; %bb.1:
-; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT: s_and_b32 s4, s2, -4
-; GFX1132-FAKE16-NEXT: s_mov_b32 s5, s3
-; GFX1132-FAKE16-NEXT: s_and_b32 s2, s2, 3
-; GFX1132-FAKE16-NEXT: s_load_b32 s7, s[4:5], 0x0
-; GFX1132-FAKE16-NEXT: s_bcnt1_i32_b32 s6, s6
-; GFX1132-FAKE16-NEXT: s_lshl_b32 s2, s2, 3
-; GFX1132-FAKE16-NEXT: s_mul_i32 s6, s8, s6
-; GFX1132-FAKE16-NEXT: s_lshl_b32 s3, 0xff, s2
-; GFX1132-FAKE16-NEXT: s_and_b32 s6, s6, 0xff
-; GFX1132-FAKE16-NEXT: s_not_b32 s11, s3
-; GFX1132-FAKE16-NEXT: s_lshl_b32 s12, s6, s2
-; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1132-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1132-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
-; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1132-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-FAKE16-NEXT: .LBB13_4: ; %Flow
-; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2
-; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX1132-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX1132-FAKE16-NEXT: s_endpgm
-;
-; GFX1264-TRUE16-LABEL: uniform_add_i8:
-; GFX1264-TRUE16: ; %bb.0:
-; GFX1264-TRUE16-NEXT: s_clause 0x1
-; GFX1264-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-TRUE16-NEXT: s_load_b32 s10, s[4:5], 0x34
-; GFX1264-TRUE16-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-TRUE16-NEXT: s_mov_b64 s[8:9], exec
-; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
-; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
-; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB13_4
-; GFX1264-TRUE16-NEXT: ; %bb.1:
-; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: s_and_b32 s4, s2, -4
-; GFX1264-TRUE16-NEXT: s_mov_b32 s5, s3
-; GFX1264-TRUE16-NEXT: s_and_b32 s2, s2, 3
-; GFX1264-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
-; GFX1264-TRUE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1264-TRUE16-NEXT: s_lshl_b32 s11, s2, 3
-; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1264-TRUE16-NEXT: s_mul_i32 s2, s10, s6
-; GFX1264-TRUE16-NEXT: s_lshl_b32 s12, 0xff, s11
-; GFX1264-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX1264-TRUE16-NEXT: s_not_b32 s13, s12
-; GFX1264-TRUE16-NEXT: s_lshl_b32 s14, s2, s11
-; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1
-; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3
-; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0
-; GFX1264-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
-; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1264-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
-; GFX1264-TRUE16-NEXT: .LBB13_4: ; %Flow
-; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
-; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
-; GFX1264-TRUE16-NEXT: s_endpgm
-;
-; GFX1264-FAKE16-LABEL: uniform_add_i8:
-; GFX1264-FAKE16: ; %bb.0:
-; GFX1264-FAKE16-NEXT: s_clause 0x1
-; GFX1264-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-FAKE16-NEXT: s_load_b32 s10, s[4:5], 0x34
-; GFX1264-FAKE16-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-FAKE16-NEXT: s_mov_b64 s[8:9], exec
-; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
-; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB13_4
-; GFX1264-FAKE16-NEXT: ; %bb.1:
-; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: s_and_b32 s4, s2, -4
-; GFX1264-FAKE16-NEXT: s_mov_b32 s5, s3
-; GFX1264-FAKE16-NEXT: s_and_b32 s2, s2, 3
-; GFX1264-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
-; GFX1264-FAKE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1264-FAKE16-NEXT: s_lshl_b32 s11, s2, 3
-; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX1264-FAKE16-NEXT: s_mul_i32 s2, s10, s6
-; GFX1264-FAKE16-NEXT: s_lshl_b32 s12, 0xff, s11
-; GFX1264-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX1264-FAKE16-NEXT: s_not_b32 s13, s12
-; GFX1264-FAKE16-NEXT: s_lshl_b32 s14, s2, s11
-; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3
-; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0
-; GFX1264-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
-; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1264-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
-; GFX1264-FAKE16-NEXT: .LBB13_4: ; %Flow
-; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2
-; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
-; GFX1264-FAKE16-NEXT: s_endpgm
-;
-; GFX1232-TRUE16-LABEL: uniform_add_i8:
-; GFX1232-TRUE16: ; %bb.0:
-; GFX1232-TRUE16-NEXT: s_clause 0x1
-; GFX1232-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34
-; GFX1232-TRUE16-NEXT: s_mov_b32 s6, exec_lo
-; GFX1232-TRUE16-NEXT: s_mov_b32 s10, 0
-; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1232-TRUE16-NEXT: s_mov_b32 s9, exec_lo
-; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
-; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB13_4
-; GFX1232-TRUE16-NEXT: ; %bb.1:
-; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: s_and_b32 s4, s2, -4
-; GFX1232-TRUE16-NEXT: s_mov_b32 s5, s3
-; GFX1232-TRUE16-NEXT: s_and_b32 s2, s2, 3
-; GFX1232-TRUE16-NEXT: s_load_b32 s7, s[4:5], 0x0
-; GFX1232-TRUE16-NEXT: s_bcnt1_i32_b32 s6, s6
-; GFX1232-TRUE16-NEXT: s_lshl_b32 s2, s2, 3
-; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-TRUE16-NEXT: s_mul_i32 s6, s8, s6
-; GFX1232-TRUE16-NEXT: s_lshl_b32 s3, 0xff, s2
-; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-TRUE16-NEXT: s_and_b32 s6, s6, 0xff
-; GFX1232-TRUE16-NEXT: s_not_b32 s11, s3
-; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-TRUE16-NEXT: s_lshl_b32 s12, s6, s2
-; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1
-; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1232-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1232-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
-; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1232-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
-; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-TRUE16-NEXT: .LBB13_4: ; %Flow
-; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
-; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
-; GFX1232-TRUE16-NEXT: s_endpgm
-;
-; GFX1232-FAKE16-LABEL: uniform_add_i8:
-; GFX1232-FAKE16: ; %bb.0:
-; GFX1232-FAKE16-NEXT: s_clause 0x1
-; GFX1232-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34
-; GFX1232-FAKE16-NEXT: s_mov_b32 s6, exec_lo
-; GFX1232-FAKE16-NEXT: s_mov_b32 s10, 0
-; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1232-FAKE16-NEXT: s_mov_b32 s9, exec_lo
-; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
-; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB13_4
-; GFX1232-FAKE16-NEXT: ; %bb.1:
-; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: s_and_b32 s4, s2, -4
-; GFX1232-FAKE16-NEXT: s_mov_b32 s5, s3
-; GFX1232-FAKE16-NEXT: s_and_b32 s2, s2, 3
-; GFX1232-FAKE16-NEXT: s_load_b32 s7, s[4:5], 0x0
-; GFX1232-FAKE16-NEXT: s_bcnt1_i32_b32 s6, s6
-; GFX1232-FAKE16-NEXT: s_lshl_b32 s2, s2, 3
-; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-FAKE16-NEXT: s_mul_i32 s6, s8, s6
-; GFX1232-FAKE16-NEXT: s_lshl_b32 s3, 0xff, s2
-; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-FAKE16-NEXT: s_and_b32 s6, s6, 0xff
-; GFX1232-FAKE16-NEXT: s_not_b32 s11, s3
-; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-FAKE16-NEXT: s_lshl_b32 s12, s6, s2
-; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1232-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1232-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo,
s10 -; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB13_2 -; GFX1232-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1232-FAKE16-NEXT: .LBB13_4: ; %Flow -; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 -; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null -; GFX1232-FAKE16-NEXT: s_endpgm - %rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1 - store i8 %rmw, ptr addrspace(1) %result - ret void -} - -define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i8 %val) { -; GFX7LESS-LABEL: uniform_xchg_i8: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd -; GFX7LESS-NEXT: s_mov_b64 s[8:9], 0 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_and_b32 s4, s2, -4 -; GFX7LESS-NEXT: s_mov_b32 s5, s3 -; GFX7LESS-NEXT: s_and_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_and_b32 s3, s6, 0xff -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7LESS-NEXT: s_lshl_b32 s10, s2, 3 -; GFX7LESS-NEXT: s_lshl_b32 s11, 0xff, s10 -; GFX7LESS-NEXT: s_lshl_b32 s2, s3, s10 -; GFX7LESS-NEXT: s_not_b32 s3, s11 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2 -; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: uniform_xchg_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s4, s2, -4 -; GFX8-NEXT: s_mov_b32 s5, s3 -; GFX8-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s2, s2, 3 -; GFX8-NEXT: s_lshl_b32 s8, s2, 3 -; GFX8-NEXT: s_lshl_b32 s2, 0xff, s8 -; GFX8-NEXT: s_not_b32 s9, s2 -; GFX8-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NEXT: s_lshl_b32 s10, s2, s8 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX8-NEXT: v_and_b32_e32 v0, s9, v1 -; GFX8-NEXT: v_or_b32_e32 v0, s10, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, v2 -; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: uniform_xchg_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, -4 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_lshl_b32 s8, s2, 3 -; GFX9-NEXT: s_lshl_b32 s2, 0xff, s8 -; GFX9-NEXT: s_not_b32 s9, s2 -; GFX9-NEXT: s_and_b32 s2, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s2, s8 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s9, v1 -; GFX9-NEXT: v_or_b32_e32 v0, s10, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v2 -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: uniform_xchg_i8: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_and_b32 s4, s2, -4 -; GFX1064-NEXT: s_mov_b32 s5, s3 -; GFX1064-NEXT: s_and_b32 s2, s2, 3 -; GFX1064-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX1064-NEXT: s_lshl_b32 s8, s2, 3 -; GFX1064-NEXT: s_and_b32 s6, s6, 0xff -; GFX1064-NEXT: s_lshl_b32 s2, 0xff, s8 -; GFX1064-NEXT: s_lshl_b32 s10, s6, s8 -; GFX1064-NEXT: s_not_b32 s9, s2 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, s10 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v2 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1064-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: uniform_xchg_i8: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX1032-NEXT: s_mov_b32 s9, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_and_b32 s4, s2, -4 -; GFX1032-NEXT: s_mov_b32 s5, s3 -; GFX1032-NEXT: s_and_b32 s2, s2, 3 -; GFX1032-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX1032-NEXT: s_lshl_b32 s2, s2, 3 -; GFX1032-NEXT: s_and_b32 s6, s6, 0xff -; GFX1032-NEXT: s_lshl_b32 s3, 0xff, s2 -; GFX1032-NEXT: s_lshl_b32 s8, s6, s2 -; GFX1032-NEXT: s_not_b32 s3, s3 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v2 -; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: uniform_xchg_i8: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_and_b32 s4, s2, -4 -; GFX1164-NEXT: s_mov_b32 s5, s3 -; GFX1164-NEXT: s_and_b32 s2, s2, 3 -; GFX1164-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GFX1164-NEXT: s_lshl_b32 s8, s2, 3 -; GFX1164-NEXT: s_and_b32 s6, s6, 0xff -; GFX1164-NEXT: s_lshl_b32 s2, 0xff, s8 -; GFX1164-NEXT: s_lshl_b32 s10, s6, s8 -; GFX1164-NEXT: s_not_b32 s9, s2 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_and_or_b32 v0, v1, s9, s10 -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v2 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: buffer_store_b8 v0, 
off, s[0:3], 0 -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: uniform_xchg_i8: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1132-NEXT: s_mov_b32 s9, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_and_b32 s4, s2, -4 -; GFX1132-NEXT: s_mov_b32 s5, s3 -; GFX1132-NEXT: s_and_b32 s2, s2, 3 -; GFX1132-NEXT: s_load_b32 s7, s[4:5], 0x0 -; GFX1132-NEXT: s_lshl_b32 s2, s2, 3 -; GFX1132-NEXT: s_and_b32 s6, s6, 0xff -; GFX1132-NEXT: s_lshl_b32 s3, 0xff, s2 -; GFX1132-NEXT: s_lshl_b32 s8, s6, s2 -; GFX1132-NEXT: s_not_b32 s3, s3 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s7 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX1132-NEXT: s_endpgm -; -; GFX1264-LABEL: uniform_xchg_i8: -; GFX1264: ; %bb.0: -; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_and_b32 s4, s2, -4 -; GFX1264-NEXT: s_mov_b32 s5, s3 -; GFX1264-NEXT: s_and_b32 s2, s2, 3 -; GFX1264-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GFX1264-NEXT: s_lshl_b32 s8, s2, 3 -; GFX1264-NEXT: s_and_b32 s6, s6, 0xff -; GFX1264-NEXT: s_lshl_b32 s2, 0xff, s8 -; GFX1264-NEXT: s_lshl_b32 s10, s6, s8 -; GFX1264-NEXT: s_not_b32 s9, s2 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b64 s[2:3], 0 -; GFX1264-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-NEXT: v_and_or_b32 v0, v1, s9, s10 -; GFX1264-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-NEXT: v_mov_b32_e32 v1, v2 -; GFX1264-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1264-NEXT: s_wait_alu 0xfffe -; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1264-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v2 -; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: buffer_store_b8 v0, off, s[0:3], null -; GFX1264-NEXT: s_endpgm -; -; GFX1232-LABEL: uniform_xchg_i8: -; 
GFX1232: ; %bb.0: -; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1232-NEXT: s_mov_b32 s9, 0 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_and_b32 s4, s2, -4 -; GFX1232-NEXT: s_mov_b32 s5, s3 -; GFX1232-NEXT: s_and_b32 s2, s2, 3 -; GFX1232-NEXT: s_load_b32 s7, s[4:5], 0x0 -; GFX1232-NEXT: s_lshl_b32 s2, s2, 3 -; GFX1232-NEXT: s_and_b32 s6, s6, 0xff -; GFX1232-NEXT: s_lshl_b32 s3, 0xff, s2 -; GFX1232-NEXT: s_lshl_b32 s8, s6, s2 -; GFX1232-NEXT: s_not_b32 s3, s3 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mov_b32_e32 v1, s7 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1232-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-NEXT: v_mov_b32_e32 v1, v2 -; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1232-NEXT: s_cbranch_execnz .LBB14_1 -; GFX1232-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: buffer_store_b8 v0, off, s[0:3], null -; GFX1232-NEXT: s_endpgm - %rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1 - store i8 %rmw, ptr addrspace(1) %result - ret void -} - -define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i16 %val) { -; GFX7LESS-LABEL: uniform_or_i16: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB15_2 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_and_b32 s8, s2, -4 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_and_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_lshl_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_and_b32 s7, s6, 0xffff -; GFX7LESS-NEXT: s_lshl_b32 s7, s7, s2 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s7 -; GFX7LESS-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_bfe_u32 v0, v0, s2, 16 -; GFX7LESS-NEXT: .LBB15_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: uniform_or_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 
-; GFX8-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB15_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s8, s2, -4 -; GFX8-NEXT: s_and_b32 s2, s2, 3 -; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s2 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX8-NEXT: .LBB15_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: uniform_or_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s2, -4 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 -; GFX9-NEXT: s_and_b32 s3, 0xffff, s6 -; GFX9-NEXT: s_lshl_b32 s3, s3, s2 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: uniform_or_i16: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB15_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_and_b32 s7, s2, 3 -; GFX1064-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1064-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1064-NEXT: s_and_b32 s8, s2, -4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s9 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX1064-NEXT: 
s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1064-NEXT: .LBB15_2: -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1064-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: uniform_or_i16: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB15_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_and_b32 s5, s2, 3 -; GFX1032-NEXT: s_and_b32 s7, 0xffff, s6 -; GFX1032-NEXT: s_lshl_b32 s5, s5, 3 -; GFX1032-NEXT: s_and_b32 s8, s2, -4 -; GFX1032-NEXT: s_lshl_b32 s7, s7, s5 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s7 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s5, v0 -; GFX1032-NEXT: .LBB15_2: -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc_lo -; GFX1032-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-TRUE16-LABEL: uniform_or_i16: -; GFX1164-TRUE16: ; %bb.0: -; GFX1164-TRUE16-NEXT: s_clause 0x1 -; GFX1164-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1164-TRUE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX1164-TRUE16-NEXT: ; %bb.1: -; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1164-TRUE16-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1164-TRUE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1164-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-TRUE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1164-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1164-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1164-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc -; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1164-TRUE16-NEXT: .LBB15_2: -; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; 
GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX1164-TRUE16-NEXT: s_endpgm -; -; GFX1164-FAKE16-LABEL: uniform_or_i16: -; GFX1164-FAKE16: ; %bb.0: -; GFX1164-FAKE16-NEXT: s_clause 0x1 -; GFX1164-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1164-FAKE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX1164-FAKE16-NEXT: ; %bb.1: -; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1164-FAKE16-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1164-FAKE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-FAKE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1164-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1164-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc -; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1164-FAKE16-NEXT: .LBB15_2: -; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX1164-FAKE16-NEXT: s_endpgm -; -; GFX1132-TRUE16-LABEL: uniform_or_i16: -; GFX1132-TRUE16: ; %bb.0: -; GFX1132-TRUE16-NEXT: s_clause 0x1 -; GFX1132-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1132-TRUE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX1132-TRUE16-NEXT: ; %bb.1: -; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1132-TRUE16-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1132-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1132-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1132-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1132-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc -; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1132-TRUE16-NEXT: .LBB15_2: -; GFX1132-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX1132-TRUE16-NEXT: s_endpgm -; -; GFX1132-FAKE16-LABEL: uniform_or_i16: -; GFX1132-FAKE16: ; %bb.0: -; GFX1132-FAKE16-NEXT: s_clause 0x1 -; GFX1132-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1132-FAKE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX1132-FAKE16-NEXT: ; %bb.1: -; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1132-FAKE16-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1132-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1132-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc -; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1132-FAKE16-NEXT: .LBB15_2: -; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX1132-FAKE16-NEXT: s_endpgm -; -; GFX1264-TRUE16-LABEL: uniform_or_i16: -; GFX1264-TRUE16: ; %bb.0: -; GFX1264-TRUE16-NEXT: s_clause 0x1 -; GFX1264-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1264-TRUE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX1264-TRUE16-NEXT: ; %bb.1: -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1264-TRUE16-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1264-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1264-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1264-TRUE16-NEXT: 
s_mov_b32 s10, -1 -; GFX1264-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1264-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1264-TRUE16-NEXT: .LBB15_2: -; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null -; GFX1264-TRUE16-NEXT: s_endpgm -; -; GFX1264-FAKE16-LABEL: uniform_or_i16: -; GFX1264-FAKE16: ; %bb.0: -; GFX1264-FAKE16-NEXT: s_clause 0x1 -; GFX1264-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1264-FAKE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX1264-FAKE16-NEXT: ; %bb.1: -; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1264-FAKE16-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1264-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1264-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1264-FAKE16-NEXT: .LBB15_2: -; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null -; GFX1264-FAKE16-NEXT: s_endpgm -; -; GFX1232-TRUE16-LABEL: uniform_or_i16: -; GFX1232-TRUE16: ; %bb.0: -; GFX1232-TRUE16-NEXT: s_clause 0x1 -; GFX1232-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1232-TRUE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX1232-TRUE16-NEXT: ; %bb.1: -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; 
GFX1232-TRUE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1232-TRUE16-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX1232-TRUE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1232-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1232-TRUE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1232-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1232-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1232-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1232-TRUE16-NEXT: .LBB15_2: -; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null -; GFX1232-TRUE16-NEXT: s_endpgm -; -; GFX1232-FAKE16-LABEL: uniform_or_i16: -; GFX1232-FAKE16: ; %bb.0: -; GFX1232-FAKE16-NEXT: s_clause 0x1 -; GFX1232-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1232-FAKE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX1232-FAKE16-NEXT: ; %bb.1: -; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1232-FAKE16-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX1232-FAKE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1232-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1232-FAKE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1232-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1232-FAKE16-NEXT: .LBB15_2: -; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null -; GFX1232-FAKE16-NEXT: s_endpgm - %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2 - store i16 %rmw, ptr addrspace(1) %result - ret void -} - -define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i16 %val) { -; GFX7LESS-LABEL: uniform_add_i16: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7LESS-NEXT: s_load_dword s10, s[4:5], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB16_4 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_and_b32 s4, s2, -4 -; GFX7LESS-NEXT: s_mov_b32 s5, s3 -; GFX7LESS-NEXT: s_and_b32 s11, s2, 3 -; GFX7LESS-NEXT: s_mov_b64 s[2:3], 0 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mul_i32 s6, s10, s6 -; GFX7LESS-NEXT: s_lshl_b32 s11, s11, 3 -; GFX7LESS-NEXT: s_load_dword s15, s[4:5], 0x0 -; GFX7LESS-NEXT: s_lshl_b32 s12, 0xffff, s11 -; GFX7LESS-NEXT: s_and_b32 s6, s6, 0xffff -; GFX7LESS-NEXT: s_not_b32 s13, s12 -; GFX7LESS-NEXT: s_lshl_b32 s14, s6, s11 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_and_b32_e32 v2, s13, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7LESS-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 -; GFX7LESS-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: v_bfe_u32 v0, v2, s11, 16 -; GFX7LESS-NEXT: .LBB16_4: ; %Flow -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xffff -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0 -; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: uniform_add_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dword s10, s[4:5], 0x34 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX8-NEXT: s_cbranch_execz .LBB16_4 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[6:7] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s6, s10, s4 -; GFX8-NEXT: s_and_b32 s4, s2, -4 -; GFX8-NEXT: s_mov_b32 s5, s3 -; GFX8-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s2, s2, 3 -; GFX8-NEXT: s_lshl_b32 s11, s2, 3 -; GFX8-NEXT: s_lshl_b32 s12, 0xffff, s11 -; GFX8-NEXT: s_and_b32 s2, s6, 0xffff -; GFX8-NEXT: s_not_b32 s13, s12 -; GFX8-NEXT: s_lshl_b32 s14, s2, s11 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: .LBB16_2: ; %atomicrmw.start -; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v1 -; GFX8-NEXT: v_and_b32_e32 v2, s13, v1 -; GFX8-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB16_2 -; GFX8-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2 -; GFX8-NEXT: .LBB16_4: ; %Flow -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_mad_u16 v0, s10, v4, v0 -; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: uniform_add_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s10, s[4:5], 0x34 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_4 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[6:7] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s6, s10, s4 -; GFX9-NEXT: s_and_b32 s4, s2, -4 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_lshl_b32 s11, s2, 3 -; GFX9-NEXT: s_lshl_b32 s12, 0xffff, s11 -; GFX9-NEXT: s_and_b32 s2, s6, 0xffff -; GFX9-NEXT: s_not_b32 s13, s12 -; GFX9-NEXT: s_lshl_b32 s14, s2, s11 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_u32_e32 v0, s14, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX9-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB16_2 -; GFX9-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2 -; GFX9-NEXT: .LBB16_4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v4, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: uniform_add_i16: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064-NEXT: s_load_dword s10, s[4:5], 0x34 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; 
GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_4 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_and_b32 s4, s2, -4 -; GFX1064-NEXT: s_mov_b32 s5, s3 -; GFX1064-NEXT: s_and_b32 s2, s2, 3 -; GFX1064-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: s_lshl_b32 s11, s2, 3 -; GFX1064-NEXT: s_mul_i32 s2, s10, s6 -; GFX1064-NEXT: s_lshl_b32 s12, 0xffff, s11 -; GFX1064-NEXT: s_and_b32 s2, s2, 0xffff -; GFX1064-NEXT: s_not_b32 s13, s12 -; GFX1064-NEXT: s_lshl_b32 s14, s2, s11 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 -; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v2 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1064-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2 -; GFX1064-NEXT: .LBB16_4: ; %Flow -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: uniform_add_i16: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032-NEXT: s_load_dword s8, s[4:5], 0x34 -; GFX1032-NEXT: s_mov_b32 s6, exec_lo -; GFX1032-NEXT: s_mov_b32 s10, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_4 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_and_b32 s4, s2, -4 -; GFX1032-NEXT: s_mov_b32 s5, s3 -; GFX1032-NEXT: s_and_b32 s2, s2, 3 -; GFX1032-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1032-NEXT: s_lshl_b32 s2, s2, 3 -; GFX1032-NEXT: s_mul_i32 s6, s8, s6 -; GFX1032-NEXT: s_lshl_b32 s3, 0xffff, s2 -; GFX1032-NEXT: s_and_b32 s6, s6, 0xffff -; GFX1032-NEXT: s_not_b32 s11, s3 -; GFX1032-NEXT: s_lshl_b32 s12, s6, s2 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: 
v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v2 -; GFX1032-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s10 -; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1032-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1032-NEXT: .LBB16_4: ; %Flow -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-TRUE16-LABEL: uniform_add_i16: -; GFX1164-TRUE16: ; %bb.0: -; GFX1164-TRUE16-NEXT: s_clause 0x1 -; GFX1164-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-TRUE16-NEXT: s_load_b32 s10, s[4:5], 0x34 -; GFX1164-TRUE16-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-TRUE16-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX1164-TRUE16-NEXT: ; %bb.1: -; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_and_b32 s4, s2, -4 -; GFX1164-TRUE16-NEXT: s_mov_b32 s5, s3 -; GFX1164-TRUE16-NEXT: s_and_b32 s2, s2, 3 -; GFX1164-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GFX1164-TRUE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-TRUE16-NEXT: s_lshl_b32 s11, s2, 3 -; GFX1164-TRUE16-NEXT: s_mul_i32 s2, s10, s6 -; GFX1164-TRUE16-NEXT: s_lshl_b32 s12, 0xffff, s11 -; GFX1164-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX1164-TRUE16-NEXT: s_not_b32 s13, s12 -; GFX1164-TRUE16-NEXT: s_lshl_b32 s14, s2, s11 -; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1164-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 -; GFX1164-TRUE16-NEXT: 
.LBB16_4: ; %Flow -; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 -; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX1164-TRUE16-NEXT: s_endpgm -; -; GFX1164-FAKE16-LABEL: uniform_add_i16: -; GFX1164-FAKE16: ; %bb.0: -; GFX1164-FAKE16-NEXT: s_clause 0x1 -; GFX1164-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-FAKE16-NEXT: s_load_b32 s10, s[4:5], 0x34 -; GFX1164-FAKE16-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-FAKE16-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX1164-FAKE16-NEXT: ; %bb.1: -; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_and_b32 s4, s2, -4 -; GFX1164-FAKE16-NEXT: s_mov_b32 s5, s3 -; GFX1164-FAKE16-NEXT: s_and_b32 s2, s2, 3 -; GFX1164-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GFX1164-FAKE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-FAKE16-NEXT: s_lshl_b32 s11, s2, 3 -; GFX1164-FAKE16-NEXT: s_mul_i32 s2, s10, s6 -; GFX1164-FAKE16-NEXT: s_lshl_b32 s12, 0xffff, s11 -; GFX1164-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX1164-FAKE16-NEXT: s_not_b32 s13, s12 -; GFX1164-FAKE16-NEXT: s_lshl_b32 s14, s2, s11 -; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1164-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 -; GFX1164-FAKE16-NEXT: .LBB16_4: ; %Flow -; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: 
s_mov_b32 s3, 0x31016000 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 -; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX1164-FAKE16-NEXT: s_endpgm -; -; GFX1132-TRUE16-LABEL: uniform_add_i16: -; GFX1132-TRUE16: ; %bb.0: -; GFX1132-TRUE16-NEXT: s_clause 0x1 -; GFX1132-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34 -; GFX1132-TRUE16-NEXT: s_mov_b32 s6, exec_lo -; GFX1132-TRUE16-NEXT: s_mov_b32 s10, 0 -; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 -; GFX1132-TRUE16-NEXT: s_mov_b32 s9, exec_lo -; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX1132-TRUE16-NEXT: ; %bb.1: -; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_and_b32 s4, s2, -4 -; GFX1132-TRUE16-NEXT: s_mov_b32 s5, s3 -; GFX1132-TRUE16-NEXT: s_and_b32 s2, s2, 3 -; GFX1132-TRUE16-NEXT: s_load_b32 s7, s[4:5], 0x0 -; GFX1132-TRUE16-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s2, s2, 3 -; GFX1132-TRUE16-NEXT: s_mul_i32 s6, s8, s6 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s3, 0xffff, s2 -; GFX1132-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff -; GFX1132-TRUE16-NEXT: s_not_b32 s11, s3 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s12, s6, s2 -; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 -; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1132-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1132-TRUE16-NEXT: .LBB16_4: ; %Flow -; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 -; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX1132-TRUE16-NEXT: s_endpgm -; -; GFX1132-FAKE16-LABEL: uniform_add_i16: -; GFX1132-FAKE16: ; %bb.0: -; GFX1132-FAKE16-NEXT: 
s_clause 0x1 -; GFX1132-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34 -; GFX1132-FAKE16-NEXT: s_mov_b32 s6, exec_lo -; GFX1132-FAKE16-NEXT: s_mov_b32 s10, 0 -; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 -; GFX1132-FAKE16-NEXT: s_mov_b32 s9, exec_lo -; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX1132-FAKE16-NEXT: ; %bb.1: -; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_and_b32 s4, s2, -4 -; GFX1132-FAKE16-NEXT: s_mov_b32 s5, s3 -; GFX1132-FAKE16-NEXT: s_and_b32 s2, s2, 3 -; GFX1132-FAKE16-NEXT: s_load_b32 s7, s[4:5], 0x0 -; GFX1132-FAKE16-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s2, s2, 3 -; GFX1132-FAKE16-NEXT: s_mul_i32 s6, s8, s6 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s3, 0xffff, s2 -; GFX1132-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff -; GFX1132-FAKE16-NEXT: s_not_b32 s11, s3 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s12, s6, s2 -; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 -; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1132-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1132-FAKE16-NEXT: .LBB16_4: ; %Flow -; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 -; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX1132-FAKE16-NEXT: s_endpgm -; -; GFX1264-TRUE16-LABEL: uniform_add_i16: -; GFX1264-TRUE16: ; %bb.0: -; GFX1264-TRUE16-NEXT: s_clause 0x1 -; GFX1264-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-TRUE16-NEXT: s_load_b32 s10, s[4:5], 0x34 -; GFX1264-TRUE16-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-TRUE16-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; 
GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX1264-TRUE16-NEXT: ; %bb.1: -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_and_b32 s4, s2, -4 -; GFX1264-TRUE16-NEXT: s_mov_b32 s5, s3 -; GFX1264-TRUE16-NEXT: s_and_b32 s2, s2, 3 -; GFX1264-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GFX1264-TRUE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1264-TRUE16-NEXT: s_lshl_b32 s11, s2, 3 -; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX1264-TRUE16-NEXT: s_mul_i32 s2, s10, s6 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s12, 0xffff, s11 -; GFX1264-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX1264-TRUE16-NEXT: s_not_b32 s13, s12 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s14, s2, s11 -; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1264-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1264-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 -; GFX1264-TRUE16-NEXT: .LBB16_4: ; %Flow -; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 -; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null -; GFX1264-TRUE16-NEXT: s_endpgm -; -; GFX1264-FAKE16-LABEL: uniform_add_i16: -; GFX1264-FAKE16: ; %bb.0: -; GFX1264-FAKE16-NEXT: s_clause 0x1 -; GFX1264-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-FAKE16-NEXT: s_load_b32 s10, s[4:5], 0x34 -; GFX1264-FAKE16-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-FAKE16-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX1264-FAKE16-NEXT: ; %bb.1: -; GFX1264-FAKE16-NEXT: 
s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_and_b32 s4, s2, -4 -; GFX1264-FAKE16-NEXT: s_mov_b32 s5, s3 -; GFX1264-FAKE16-NEXT: s_and_b32 s2, s2, 3 -; GFX1264-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GFX1264-FAKE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1264-FAKE16-NEXT: s_lshl_b32 s11, s2, 3 -; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX1264-FAKE16-NEXT: s_mul_i32 s2, s10, s6 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s12, 0xffff, s11 -; GFX1264-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX1264-FAKE16-NEXT: s_not_b32 s13, s12 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s14, s2, s11 -; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1264-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1264-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 -; GFX1264-FAKE16-NEXT: .LBB16_4: ; %Flow -; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 -; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null -; GFX1264-FAKE16-NEXT: s_endpgm -; -; GFX1232-TRUE16-LABEL: uniform_add_i16: -; GFX1232-TRUE16: ; %bb.0: -; GFX1232-TRUE16-NEXT: s_clause 0x1 -; GFX1232-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34 -; GFX1232-TRUE16-NEXT: s_mov_b32 s6, exec_lo -; GFX1232-TRUE16-NEXT: s_mov_b32 s10, 0 -; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s9, exec_lo -; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX1232-TRUE16-NEXT: ; %bb.1: -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_and_b32 s4, s2, -4 -; GFX1232-TRUE16-NEXT: s_mov_b32 s5, s3 -; GFX1232-TRUE16-NEXT: s_and_b32 s2, s2, 3 -; GFX1232-TRUE16-NEXT: s_load_b32 s7, s[4:5], 0x0 -; GFX1232-TRUE16-NEXT: s_bcnt1_i32_b32 s6, s6 -; 
GFX1232-TRUE16-NEXT: s_lshl_b32 s2, s2, 3 -; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX1232-TRUE16-NEXT: s_mul_i32 s6, s8, s6 -; GFX1232-TRUE16-NEXT: s_lshl_b32 s3, 0xffff, s2 -; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX1232-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff -; GFX1232-TRUE16-NEXT: s_not_b32 s11, s3 -; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX1232-TRUE16-NEXT: s_lshl_b32 s12, s6, s2 -; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1232-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 -; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1232-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1232-TRUE16-NEXT: .LBB16_4: ; %Flow -; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 -; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null -; GFX1232-TRUE16-NEXT: s_endpgm -; -; GFX1232-FAKE16-LABEL: uniform_add_i16: -; GFX1232-FAKE16: ; %bb.0: -; GFX1232-FAKE16-NEXT: s_clause 0x1 -; GFX1232-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34 -; GFX1232-FAKE16-NEXT: s_mov_b32 s6, exec_lo -; GFX1232-FAKE16-NEXT: s_mov_b32 s10, 0 -; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s9, exec_lo -; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX1232-FAKE16-NEXT: ; %bb.1: -; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_and_b32 s4, s2, -4 -; GFX1232-FAKE16-NEXT: s_mov_b32 s5, s3 -; GFX1232-FAKE16-NEXT: s_and_b32 s2, s2, 3 -; GFX1232-FAKE16-NEXT: s_load_b32 s7, s[4:5], 0x0 -; GFX1232-FAKE16-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1232-FAKE16-NEXT: s_lshl_b32 s2, s2, 3 -; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX1232-FAKE16-NEXT: s_mul_i32 s6, s8, s6 -; GFX1232-FAKE16-NEXT: s_lshl_b32 s3, 0xffff, s2 -; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX1232-FAKE16-NEXT: 
s_and_b32 s6, s6, 0xffff -; GFX1232-FAKE16-NEXT: s_not_b32 s11, s3 -; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX1232-FAKE16-NEXT: s_lshl_b32 s12, s6, s2 -; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start -; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1232-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 -; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB16_2 -; GFX1232-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end -; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 -; GFX1232-FAKE16-NEXT: .LBB16_4: ; %Flow -; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 -; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null -; GFX1232-FAKE16-NEXT: s_endpgm - %rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2 - store i16 %rmw, ptr addrspace(1) %result - ret void -} - define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i16 %val) { ; GFX7LESS-LABEL: uniform_xchg_i16: ; GFX7LESS: ; %bb.0: @@ -10150,7 +7136,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, s2, v0 @@ -10163,7 +7149,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -10191,7 +7177,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, 
-1 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_and_b32_e32 v0, s9, v1 ; GFX8-NEXT: v_or_b32_e32 v0, s10, v0 @@ -10203,7 +7189,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -10231,7 +7217,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v0, s9, v1 ; GFX9-NEXT: v_or_b32_e32 v0, s10, v0 @@ -10243,7 +7229,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: s_cbranch_execnz .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -10272,7 +7258,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, s10 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 @@ -10283,7 +7269,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2 @@ -10312,7 +7298,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, s8 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 @@ -10323,7 +7309,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 ; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -10352,7 +7338,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start ; 
GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_and_or_b32 v0, v1, s9, s10 @@ -10365,7 +7351,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v2 @@ -10394,7 +7380,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_and_or_b32 v0, v1, s3, s8 @@ -10406,7 +7392,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -10435,7 +7421,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-NEXT: s_mov_b64 s[2:3], 0 -; GFX1264-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1264-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-NEXT: v_and_or_b32 v0, v1, s9, s10 @@ -10448,7 +7434,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1264-NEXT: s_cbranch_execnz .LBB12_1 ; GFX1264-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v2 @@ -10477,7 +7463,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1232-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-NEXT: v_and_or_b32 v0, v1, s3, s8 @@ -10489,7 +7475,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1232-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1232-NEXT: s_cbranch_execnz .LBB12_1 ; GFX1232-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -10497,7 +7483,7 @@ define amdgpu_kernel void 
@uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1232-NEXT: s_endpgm - %rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2 + %rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2, !amdgpu.no.fine.grained.memory !0 store i16 %rmw, ptr addrspace(1) %result ret void } @@ -10523,7 +7509,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -10541,7 +7527,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -10567,7 +7553,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX8-NEXT: s_not_b32 s2, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_add_f16_e32 v0, s11, v0 @@ -10582,7 +7568,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -10608,7 +7594,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX9-NEXT: v_add_f16_e32 v0, s11, v0 @@ -10622,7 +7608,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX9-NEXT: s_cbranch_execnz .LBB18_1 +; GFX9-NEXT: s_cbranch_execnz .LBB13_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -10649,7 +7635,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v1 ; GFX1064-NEXT: v_add_f16_e32 v0, s8, v0 @@ -10663,7 +7649,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; 
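Note on the IR change to @uniform_xchg_i16 above: the test now attaches !amdgpu.no.fine.grained.memory to the atomicrmw. This metadata asserts that the atomic never operates on fine-grained host-coherent allocations, which permits the AMDGPU backend to select a native atomic where the target has one instead of the conservative compare-and-swap expansion. A minimal standalone sketch of the pattern (names here are illustrative, not from the test file; the !0 operand must resolve to a metadata node in the same module, conventionally empty):

  define amdgpu_kernel void @xchg_sketch(ptr addrspace(1) %out, ptr addrspace(1) %p, i16 %v) {
    %old = atomicrmw xchg ptr addrspace(1) %p, i16 %v monotonic, align 2, !amdgpu.no.fine.grained.memory !0
    store i16 %old, ptr addrspace(1) %out
    ret void
  }
  !0 = !{}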
GFX1064-NEXT: v_mov_b32_e32 v1, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1064-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v2 @@ -10690,7 +7676,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s6 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v1 ; GFX1032-NEXT: v_add_f16_e32 v0, s8, v0 @@ -10704,7 +7690,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 ; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -10732,7 +7718,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1164-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 @@ -10751,7 +7737,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1164-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 @@ -10779,7 +7765,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1164-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 @@ -10798,7 +7784,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1164-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 @@ -10826,7 +7812,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 
v1, s6 ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1132-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -10844,7 +7830,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1132-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -10872,7 +7858,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX1132-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1132-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -10890,7 +7876,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-FAKE16-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1132-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -10918,7 +7904,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1264-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 @@ -10937,7 +7923,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1264-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 @@ -10965,7 +7951,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1264-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1264-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 @@ -10984,7 +7970,7 
@@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1264-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 @@ -11012,7 +7998,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1232-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -11030,7 +8016,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-TRUE16-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1232-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -11058,7 +8044,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX1232-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX1232-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -11076,7 +8062,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-FAKE16-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX1232-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -11084,7 +8070,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1232-FAKE16-NEXT: s_endpgm - %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, half %val monotonic, align 2 + %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, half %val monotonic, align 2, !amdgpu.no.fine.grained.memory !0 store half %rmw, ptr addrspace(1) %result ret void } @@ -11110,7 +8096,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -11128,7 +8114,7 @@ define 
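Even with !amdgpu.no.fine.grained.memory attached, the half-typed fadd above still lowers to a word-wide buffer_atomic_cmpswap retry loop on these targets: a 16-bit value shares a 32-bit location, so the expansion shifts and masks the lane within the containing dword and loops on the CAS. The basic-block labels change (.LBB18_1 to .LBB13_1) only because functions earlier in the file were removed, not because the loop structure changed. A minimal sketch of the IR driving these checks, under the same assumptions as the sketch above:

  define amdgpu_kernel void @fadd_f16_sketch(ptr addrspace(1) %out, ptr addrspace(1) %p, half %v) {
    %old = atomicrmw fadd ptr addrspace(1) %p, half %v monotonic, align 2, !amdgpu.no.fine.grained.memory !0
    store half %old, ptr addrspace(1) %out
    ret void
  }
  !0 = !{}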
amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -11155,7 +8141,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11177,7 +8163,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -11205,7 +8191,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_f32_e32 v0, s2, v0 @@ -11224,7 +8210,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX9-NEXT: s_cbranch_execnz .LBB19_1 +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -11252,7 +8238,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX1064-NEXT: v_add_f32_e32 v0, s10, v0 @@ -11271,7 +8257,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2 @@ -11299,7 +8285,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX1032-NEXT: v_lshrrev_b32_sdwa v0, s2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX1032-NEXT: v_add_f32_e32 v0, s9, v0 @@ -11318,7 +8304,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 ; GFX1032-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -11347,7 +8333,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-TRUE16-NEXT: .p2align 6 -; GFX1164-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1164-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1 @@ -11375,7 +8361,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1164-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 @@ -11404,7 +8390,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-FAKE16-NEXT: .p2align 6 -; GFX1164-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1164-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1 @@ -11431,7 +8417,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1164-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 @@ -11460,7 +8446,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-TRUE16-NEXT: .p2align 6 -; GFX1132-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1132-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -11487,7 +8473,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-TRUE16-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 -; 
GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1132-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -11516,7 +8502,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-FAKE16-NEXT: .p2align 6 -; GFX1132-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1132-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -11542,7 +8528,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-FAKE16-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 -; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1132-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -11570,7 +8556,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1264-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1 @@ -11599,7 +8585,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1264-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 @@ -11627,7 +8613,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1264-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1 @@ -11655,7 +8641,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1264-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 @@ -11683,7 +8669,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs 
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1232-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -11711,7 +8697,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-TRUE16-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 -; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1232-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -11739,7 +8725,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX1232-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 @@ -11766,7 +8752,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-FAKE16-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 -; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1232-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 @@ -11774,7 +8760,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1232-FAKE16-NEXT: s_endpgm - %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, bfloat %val monotonic, align 2 + %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, bfloat %val monotonic, align 2, !amdgpu.no.fine.grained.memory !0 store bfloat %rmw, ptr addrspace(1) %result ret void } @@ -11802,7 +8788,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s4, s2 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 -; GFX7LESS-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11827,7 +8813,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -11853,7 +8839,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
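The bfloat variant follows the same pattern, with one extra wrinkle visible in the checks: these targets have no bf16 add, so each iteration widens to f32 (shift the 16-bit payload into the high half, v_add_f32, then round and repack via the sdwa forms) before the same dword cmpswap. Sketch, same assumptions as above:

  define amdgpu_kernel void @fadd_bf16_sketch(ptr addrspace(1) %out, ptr addrspace(1) %p, bfloat %v) {
    %old = atomicrmw fadd ptr addrspace(1) %p, bfloat %v monotonic, align 2, !amdgpu.no.fine.grained.memory !0
    store bfloat %old, ptr addrspace(1) %out
    ret void
  }
  !0 = !{}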
v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_mov_b32 s5, s3 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NEXT: v_add_f16_e32 v2, s10, v1 @@ -11867,7 +8853,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -11888,7 +8874,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_pk_add_f16 v0, v1, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 @@ -11899,7 +8885,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX9-NEXT: s_cbranch_execnz .LBB20_1 +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -11921,7 +8907,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_pk_add_f16 v0, v1, s10 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 @@ -11932,7 +8918,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 ; GFX1064-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX1064-NEXT: s_cbranch_execnz .LBB20_1 +; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -11954,7 +8940,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_pk_add_f16 v0, v1, s8 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 @@ -11965,7 +8951,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 ; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: s_cbranch_execnz .LBB20_1 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -11987,7 +8973,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB15_1: ; 
%atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_pk_add_f16 v0, v1, s10 @@ -12000,7 +8986,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1164-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[8:9] -; GFX1164-NEXT: s_cbranch_execnz .LBB20_1 +; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -12022,7 +9008,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mov_b32_e32 v1, s4 ; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_pk_add_f16 v0, v1, s8 @@ -12034,7 +9020,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1132-NEXT: s_cbranch_execnz .LBB20_1 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -12045,72 +9031,32 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1264-LABEL: uniform_fadd_v2f16: ; GFX1264: ; %bb.0: ; GFX1264-NEXT: s_clause 0x1 +; GFX1264-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-NEXT: s_load_b32 s10, s[4:5], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], 0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX1264-NEXT: s_mov_b32 s5, s3 +; GFX1264-NEXT: v_mov_b32_e32 v0, 0 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mov_b32_e32 v1, s4 -; GFX1264-NEXT: s_mov_b32 s4, s2 -; GFX1264-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-NEXT: v_pk_add_f16 v0, v1, s10 -; GFX1264-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-NEXT: v_mov_b32_e32 v1, v2 -; GFX1264-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[8:9] -; GFX1264-NEXT: s_cbranch_execnz .LBB20_1 -; GFX1264-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264-NEXT: global_atomic_pk_add_f16 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: buffer_store_b32 v2, off, s[0:3], null +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: uniform_fadd_v2f16: ; GFX1232: ; %bb.0: 
; GFX1232-NEXT: s_clause 0x1 +; GFX1232-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-NEXT: s_load_b32 s8, s[4:5], 0x34 -; GFX1232-NEXT: s_mov_b32 s9, 0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX1232-NEXT: s_mov_b32 s5, s3 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mov_b32_e32 v1, s4 -; GFX1232-NEXT: s_mov_b32 s4, s2 -; GFX1232-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_pk_add_f16 v0, v1, s8 -; GFX1232-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-NEXT: v_mov_b32_e32 v1, v2 -; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9 -; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 -; GFX1232-NEXT: s_cbranch_execnz .LBB20_1 -; GFX1232-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1232-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX1232-NEXT: global_atomic_pk_add_f16 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: buffer_store_b32 v2, off, s[0:3], null +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_endpgm - %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x half> %val monotonic, align 4 + %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x half> %val monotonic, align 4, !amdgpu.no.fine.grained.memory !0 store <2 x half> %rmw, ptr addrspace(1) %result ret void } @@ -12138,7 +9084,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s4, s2 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 -; GFX7LESS-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12160,7 +9106,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -12187,7 +9133,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX8-NEXT: s_mov_b32 s5, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 @@ -12215,7 +9161,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; 
GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_mov_b32 s11, 0xf000 @@ -12240,7 +9186,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_mov_b32 s4, s10 ; GFX9-NEXT: s_mov_b32 s5, s11 -; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 @@ -12265,7 +9211,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB21_1 +; GFX9-NEXT: s_cbranch_execnz .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b32 s11, 0xf000 @@ -12289,7 +9235,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1064-NEXT: s_mov_b32 s5, s11 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s1 -; GFX1064-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX1064-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 @@ -12314,7 +9260,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB21_1 +; GFX1064-NEXT: s_cbranch_execnz .LBB16_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 @@ -12338,7 +9284,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: s_mov_b32 s4, s10 -; GFX1032-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX1032-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 @@ -12363,7 +9309,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 ; GFX1032-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execnz .LBB21_1 +; GFX1032-NEXT: s_cbranch_execnz .LBB16_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -12389,7 +9335,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-TRUE16-NEXT: s_mov_b32 s5, s3 ; GFX1164-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-TRUE16-NEXT: .p2align 6 -; GFX1164-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX1164-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -12422,7 +9368,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[8:9] -; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX1164-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] @@ -12449,7 +9395,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s1 ; GFX1164-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-FAKE16-NEXT: .p2align 6 -; GFX1164-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX1164-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -12480,7 +9426,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX1164-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] @@ -12507,7 +9453,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-TRUE16-NEXT: s_mov_b32 s5, s3 ; GFX1132-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-TRUE16-NEXT: .p2align 6 -; GFX1132-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX1132-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -12539,7 +9485,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-TRUE16-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s8 -; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX1132-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -12566,7 +9512,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-FAKE16-NEXT: s_mov_b32 s4, s10 ; GFX1132-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-FAKE16-NEXT: .p2align 6 -; GFX1132-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX1132-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -12596,7 +9542,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX1132-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 @@ -12605,237 
+9551,41 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], 0 ; GFX1132-FAKE16-NEXT: s_endpgm ; -; GFX1264-TRUE16-LABEL: uniform_fadd_v2bf16: -; GFX1264-TRUE16: ; %bb.0: -; GFX1264-TRUE16-NEXT: s_clause 0x1 -; GFX1264-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1264-TRUE16-NEXT: s_mov_b64 s[8:9], 0 -; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_load_b32 s5, s[2:3], 0x0 -; GFX1264-TRUE16-NEXT: s_and_b32 s10, s4, 0xffff0000 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s11, s4, 16 -; GFX1264-TRUE16-NEXT: s_mov_b32 s4, s2 -; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s5 -; GFX1264-TRUE16-NEXT: s_mov_b32 s5, s3 -; GFX1264-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX1264-TRUE16-NEXT: v_add_f32_e32 v0, s11, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_add_f32_e32 v2, s10, v2 -; GFX1264-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX1264-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX1264-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1264-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1264-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1264-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[8:9] -; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 -; GFX1264-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], null -; GFX1264-TRUE16-NEXT: s_endpgm -; -; GFX1264-FAKE16-LABEL: uniform_fadd_v2bf16: -; GFX1264-FAKE16: ; %bb.0: -; GFX1264-FAKE16-NEXT: s_clause 0x1 -; GFX1264-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX1264-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x34 -; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; 
GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_load_b32 s1, s[10:11], 0x0 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s12, s0, 16 -; GFX1264-FAKE16-NEXT: s_and_b32 s13, s0, 0xffff0000 -; GFX1264-FAKE16-NEXT: s_mov_b32 s4, s10 -; GFX1264-FAKE16-NEXT: s_mov_b32 s5, s11 -; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s1 -; GFX1264-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX1264-FAKE16-NEXT: v_add_f32_e32 v0, s12, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_add_f32_e32 v2, s13, v2 -; GFX1264-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX1264-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1264-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX1264-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1264-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc -; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 -; GFX1264-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], null -; GFX1264-FAKE16-NEXT: s_endpgm -; -; GFX1232-TRUE16-LABEL: uniform_fadd_v2bf16: -; GFX1232-TRUE16: ; %bb.0: -; GFX1232-TRUE16-NEXT: s_clause 0x1 -; GFX1232-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX1232-TRUE16-NEXT: s_mov_b32 s8, 0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_load_b32 s5, s[2:3], 0x0 -; GFX1232-TRUE16-NEXT: s_and_b32 s9, s4, 0xffff0000 -; GFX1232-TRUE16-NEXT: s_lshl_b32 s10, s4, 16 -; GFX1232-TRUE16-NEXT: s_mov_b32 s4, s2 -; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s5 -; GFX1232-TRUE16-NEXT: s_mov_b32 s5, s3 -; GFX1232-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX1232-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232-TRUE16-NEXT: v_add_f32_e32 v2, s9, v2 -; GFX1232-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX1232-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1232-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1232-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1232-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1232-TRUE16-NEXT: s_or_b32 s8, vcc_lo, s8 -; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s8 -; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 -; GFX1232-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], null -; GFX1232-TRUE16-NEXT: s_endpgm +; GFX1264-LABEL: uniform_fadd_v2bf16: +; GFX1264: ; %bb.0: +; GFX1264-NEXT: s_clause 0x1 +; GFX1264-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1264-NEXT: v_mov_b32_e32 v0, 0 +; GFX1264-NEXT: s_wait_kmcnt 0x0 +; GFX1264-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264-NEXT: global_atomic_pk_add_bf16 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s2, -1 +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1264-NEXT: s_endpgm ; -; GFX1232-FAKE16-LABEL: uniform_fadd_v2bf16: -; GFX1232-FAKE16: ; %bb.0: -; GFX1232-FAKE16-NEXT: s_clause 0x1 -; GFX1232-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX1232-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x34 -; GFX1232-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_load_b32 s4, s[10:11], 0x0 -; GFX1232-FAKE16-NEXT: s_lshl_b32 s2, s0, 16 -; GFX1232-FAKE16-NEXT: s_and_b32 s3, s0, 0xffff0000 -; GFX1232-FAKE16-NEXT: s_mov_b32 s5, s11 -; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s4 -; GFX1232-FAKE16-NEXT: s_mov_b32 s4, s10 -; GFX1232-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX1232-FAKE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX1232-FAKE16-NEXT: v_add_f32_e32 v0, s2, v0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232-FAKE16-NEXT: v_add_f32_e32 v2, s3, v2 -; GFX1232-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX1232-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1232-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX1232-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1232-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo -; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX1232-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 -; GFX1232-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], null -; GFX1232-FAKE16-NEXT: s_endpgm - %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val monotonic, align 4 +; GFX1232-LABEL: uniform_fadd_v2bf16: +; GFX1232: ; %bb.0: +; GFX1232-NEXT: s_clause 0x1 +; GFX1232-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_wait_kmcnt 0x0 +; GFX1232-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX1232-NEXT: global_atomic_pk_add_bf16 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s2, -1 +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1232-NEXT: s_endpgm + %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val monotonic, align 4, !amdgpu.no.fine.grained.memory !0 store <2 x bfloat> %rmw, ptr addrspace(1) %result ret void } + +!0 = !{} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX1132_DPP-FAKE16: {{.*}} ; GFX1132_DPP-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll index 366432e0fc6cb..6ec6dce460c01 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll @@ -223,16 +223,16 @@ define amdgpu_kernel void @kernel_explicit_worst_case() #9 { attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"} ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { 
"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll index 93ebeafe0dee5..fcca3d705490d 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll @@ -13,13 +13,13 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast( ; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0:![0-9]+]] ; GFX9-NEXT: ret void ; ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast( ; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0:![0-9]+]] ; GFX10-NEXT: ret void ; %stof = addrspacecast ptr addrspace(5) %ptr to ptr @@ -31,13 +31,13 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel( ; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { ; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0]] ; GFX9-NEXT: ret void ; ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel( ; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { ; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0]] ; GFX10-NEXT: ret void ; %stof = addrspacecast ptr addrspace(5) %ptr to ptr @@ -147,9 +147,13 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) % attributes #0 = { "amdgpu-no-flat-scratch-init" } ;. 
-; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; GFX9: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx900" } ;. -; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ; GFX10: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } ;. +; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 10} +;. +; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 10} +;. diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index 94f670e38683b..9acf60f37f289 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -841,17 +841,17 @@ define amdgpu_kernel void @with_inline_asm() { } ;. 
-; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" 
"uniform-work-group-size"="false" } ; GFX9: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx900" } -; GFX9: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ;. -; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ; GFX10: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } -; GFX10: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll index 2b9f579e6a183..a9efcdcb0af6d 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll @@ -63,5 +63,5 @@ define amdgpu_kernel void @entry() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/bit-op-reduce-width-known-bits.ll b/llvm/test/CodeGen/AMDGPU/bit-op-reduce-width-known-bits.ll index ac5f9b6b483eb..ad26dfa7f93e8 100644 --- a/llvm/test/CodeGen/AMDGPU/bit-op-reduce-width-known-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/bit-op-reduce-width-known-bits.ll @@ -105,9 +105,8 @@ define i64 @v_xor_i64_known_i32_from_range_use_out_of_block(i64 %x) { ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: ; %bb.1: ; %inc ; CHECK-NEXT: v_not_b32_e32 v2, v4 -; CHECK-NEXT: v_not_b32_e32 v3, 0 ; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 9a3e740b344e6..05ca69e435c60 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -768,10 +768,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir new file mode 100644 index 0000000000000..d0c9740c6954e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -0,0 +1,15 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s +--- +name: snork +body: | + bb.0: + ; CHECK-LABEL: name: snork + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: SI_RETURN + %0:sreg_32 = S_MOV_B32 0 + %1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 + %2:sreg_32 = S_OR_B32 %1.sub0, 
%1.sub3, implicit-def dead $scc + SI_RETURN +... diff --git a/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir b/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir new file mode 100644 index 0000000000000..72b6b9f9ec686 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -run-pass=si-lower-sgpr-spills,greedy,si-lower-wwm-copies,virtregrewriter,prologepilog -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: widget +tracksRegLiveness: true +frameInfo: + adjustsStack: true +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } + - { id: 1, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; GCN-LABEL: name: widget + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $agpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr0 + ; GCN-NEXT: frame-setup CFI_INSTRUCTION undefined $sgpr14 + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $agpr0, 0 + ; GCN-NEXT: $exec = S_MOV_B64 -1 + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GCN-NEXT: frame-setup CFI_INSTRUCTION offset $vgpr62, 256 + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: renamable $vgpr62 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr15, 0, killed $vgpr62 + ; GCN-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: renamable $agpr0 = COPY killed renamable $vgpr62 + ; GCN-NEXT: $exec = S_MOV_B64 killed $noreg + ; GCN-NEXT: renamable $vgpr62 = IMPLICIT_DEF + ; GCN-NEXT: dead renamable $vgpr62 = V_AND_B32_e32 1, killed $vgpr62, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $agpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) + ; GCN-NEXT: liveins: $agpr0, $sgpr86, $sgpr87, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr80_sgpr81, $sgpr82_sgpr83, $sgpr84_sgpr85, $sgpr96_sgpr97, $sgpr98_sgpr99 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $agpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: renamable $vgpr62 = COPY renamable $agpr0 + ; GCN-NEXT: $exec 
= S_MOV_B64 killed $noreg + ; GCN-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR killed $vgpr62, 1 + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GCN-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec + ; GCN-NEXT: $exec = S_MOV_B64 -1 + ; GCN-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN + bb.0: + liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15 + + %45:vgpr_32 = IMPLICIT_DEF + SI_SPILL_S32_SAVE $sgpr15, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) + %16:vgpr_32 = V_AND_B32_e32 1, %45, implicit $exec + + bb.1: + successors: %bb.3, %bb.2 + + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.3 + + bb.2: + successors: %bb.4(0x04000000), %bb.1(0x7c000000) + liveins: $sgpr86, $sgpr87, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr80_sgpr81, $sgpr82_sgpr83, $sgpr84_sgpr85, $sgpr96_sgpr97, $sgpr98_sgpr99 + + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.4 + + bb.3: + ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + $sgpr14 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ADJCALLSTACKDOWN 0, 28, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + S_BRANCH %bb.2 + + bb.4: + SI_RETURN + +... diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 9a98a7cd01ed4..12de3750640db 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -42,7 +42,7 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw sub ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw sub ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -64,7 +64,7 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw and ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw and ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -86,7 +86,7 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw or ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw or ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr 
addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -108,7 +108,7 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw xor ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw xor ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -144,7 +144,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw nand ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw nand ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -166,7 +166,7 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic + %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -188,7 +188,7 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -210,7 +210,7 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic + %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -232,7 +232,7 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -254,7 +254,7 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw umax ptr 
addrspace(1) %p, i32 1 syncscope("workgroup") monotonic + %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -276,7 +276,7 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -298,7 +298,7 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic + %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -320,7 +320,7 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -388,7 +388,7 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -410,7 +410,7 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) % ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = atomicrmw udec_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic + %n32 = atomicrmw udec_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -446,7 +446,7 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %f32 = atomicrmw fadd ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic + %f32 = atomicrmw fadd ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n32 = fptoui float %f32 to i32 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr 
addrspace(1) %q, i64 %n64, i32 0
@@ -483,7 +483,7 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
 ; CHECK-NEXT: s_endpgm
-  %f32 = atomicrmw fsub ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic
+  %f32 = atomicrmw fsub ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   %n32 = fptoui float %f32 to i32
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/debug-type-mutate.ll b/llvm/test/CodeGen/AMDGPU/debug-type-mutate.ll
new file mode 100644
index 0000000000000..7c82cdb805c92
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/debug-type-mutate.ll
@@ -0,0 +1,50 @@
+; RUN: llc -stop-after=codegenprepare < %s | FileCheck %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+@0 = addrspace(4) constant [16 x i8] c"AAAAAAAAAAAAAAAA", align 16
+@1 = addrspace(1) constant [16 x i8] c"AAAAAAAAAAAAAAAA", align 16
+
+define void @func1(i32 %a0, i8 %a1, ptr %a2) #0 {
+; CHECK: define void @func1(i32 %a0, i8 %a1, ptr %a2) #0 {
+; CHECK-NEXT: %promoted = zext i32 %a0 to i64
+; CHECK-NEXT: %vl0 = lshr i64 %promoted, 12
+; CHECK-NEXT: #dbg_value(!DIArgList(i32 0, i64 %vl0), !4, !DIExpression(DIOpArg(1, i64), DIOpConvert(i32), DIOpConvert(i8), DIOpFragment(24, 8)), !9)
+  %vl0 = lshr i32 %a0, 12
+  #dbg_value(!DIArgList(i32 0, i32 %vl0), !4, !DIExpression(DIOpArg(1, i32), DIOpConvert(i8), DIOpFragment(24, 8)), !9)
+  %op0 = zext nneg i32 %vl0 to i64
+  %op1 = getelementptr inbounds nuw i8, ptr addrspace(4) @0, i64 %op0
+  %op2 = load i8, ptr addrspace(4) %op1, align 1
+  store i8 %op2, ptr %a2, align 1
+  ret void
+}
+
+define void @func2(i32 %a0, i8 %a1, ptr %a2) #0 {
+; CHECK: define void @func2(i32 %a0, i8 %a1, ptr %a2) #0 {
+; CHECK-NEXT: %vl0 = lshr i32 %a0, 12
+; CHECK-NEXT: #dbg_value(!DIArgList(i32 0, i32 %vl0), !4, !DIExpression(DIOpArg(1, i32), DIOpConvert(i8), DIOpFragment(24, 8)), !9)
+  %vl0 = lshr i32 %a0, 12
+  #dbg_value(!DIArgList(i32 0, i32 %vl0), !4, !DIExpression(DIOpArg(1, i32), DIOpConvert(i8), DIOpFragment(24, 8)), !9)
+  %op0 = zext nneg i32 %vl0 to i64
+  %op1 = getelementptr inbounds nuw i8, ptr addrspace(1) @1, i64 %op0
+  %op2 = load i8, ptr addrspace(1) %op1, align 1
+  store i8 %op2, ptr %a2, align 1
+  ret void
+}
+
+
+attributes #0 = { "target-cpu"="gfx1201" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "-", directory: "/")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocalVariable(name: "aux32", scope: !5, file: !1, line: 1757, type: !8)
+!5 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 1754, type: !6, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null}
+!8 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!9 = !DILocation(line: 0, scope: !5)
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index 0ba3c22a5d6a1..312845e9dc4a9 100644
---
a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -34,6 +34,6 @@ define amdgpu_kernel void @test_direct_indirect_call() {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 132202da33663..866c0db197617 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -27,7 +27,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 {
 
 attributes #0 = { "amdgpu-no-dispatch-id" }
 ;.
-;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
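Aside from the attribute-list churn above, the bulk of the test updates in this patch follow a single pattern: each atomicrmw or cmpxchg under test gains the !amdgpu.no.fine.grained.memory metadata, backed by an empty metadata node at the end of the module (see the "+!0 = !{}" added to flat_atomics.ll below). A minimal, self-contained sketch of that pattern follows; the kernel name and operands are illustrative, not taken from any test in the diff:

define amdgpu_kernel void @example(ptr addrspace(1) %p) {
entry:
  ; The empty !amdgpu.no.fine.grained.memory node asserts this atomic never
  ; operates on fine-grained memory, which lets the AMDGPU backend keep the
  ; native atomic instruction. Without it, some operations are expanded to a
  ; flat_atomic_cmpswap loop, as visible in the flat_atomics_i32_system.ll
  ; hunks further down.
  %old = atomicrmw add ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
  ret void
}

!0 = !{}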
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll index 19c96d709d879..fc0794c973c0a 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll @@ -154,26 +154,31 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 { ; CHECK-LABEL: realign_stack: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) -; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi +; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_cmp_lg_u32 0, s33 -; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo +; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi ; CHECK-NEXT: s_cmovk_i32 s33, 0x200 -; CHECK-NEXT: s_movk_i32 s32, 0x100 +; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo +; CHECK-NEXT: scratch_store_b32 off, v32, s33 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112 ; CHECK-NEXT: scratch_store_b128 off, v[24:27], s33 offset:96 -; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80 +; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112 ; CHECK-NEXT: scratch_store_b128 off, v[16:19], s33 offset:64 -; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48 +; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80 ; CHECK-NEXT: scratch_store_b128 off, v[8:11], s33 offset:32 +; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48 ; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16 ; CHECK-NEXT: scratch_store_b128 off, v[0:3], s33 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x47 +; CHECK-NEXT: s_movk_i32 s32, 0x100 ; CHECK-NEXT: s_cmovk_i32 s32, 0x300 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1] ; CHECK-NEXT: s_alloc_vgpr 0 ; CHECK-NEXT: s_endpgm %v = alloca <32 x i32>, align 128, addrspace(5) + ; use volatile store to avoid promotion of alloca to registers + store volatile i32 0, ptr addrspace(5) %v store <32 x i32> %x, ptr addrspace(5) %v call amdgpu_gfx void @callee(i32 71) ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b5e579b78a59c..69a0067530486 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -3621,7 +3621,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3004 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3637,7 +3638,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3804 +; GFX10-NEXT: s_movk_i32 s0, 0x3800 +; GFX10-NEXT: s_add_i32 s0, s0, 4 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3682,7 +3684,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3004 +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 +; 
GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3716,8 +3719,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3804 +; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3739,7 +3743,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3804 +; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 +; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3785,10 +3790,12 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_add_i32 s1, s32, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s32, 0x3004 +; GFX9-NEXT: s_add_i32 s0, s1, 4 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3800,8 +3807,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_add_i32 s0, s32, 0x3804 +; GFX10-NEXT: s_add_i32 s1, s32, s0 +; GFX10-NEXT: s_add_i32 s0, s1, 4 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3843,10 +3852,12 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x3004 +; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3872,8 +3883,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x3804 +; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0 +; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: 
scratch_store_dword off, v1, s0 offset:1664 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index 3304dbf3eaa3d..57be2907da4a0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -805,7 +805,7 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -879,7 +879,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -962,7 +962,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -1114,7 +1114,7 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -1413,7 +1413,7 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - 
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1487,7 +1487,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -1570,7 +1570,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1662,7 +1662,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -1722,7 +1722,7 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1791,7 +1791,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -1869,7 +1869,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1956,7 +1956,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -2017,7 +2017,7 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2090,7 +2090,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile max ptr %gep, i32 %in 
syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -2169,7 +2169,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 % entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2260,7 +2260,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -2316,7 +2316,7 @@ define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2384,7 +2384,7 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -2458,7 +2458,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2544,7 +2544,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -2605,7 +2605,7 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2678,7 +2678,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -2757,7 +2757,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr 
%ptr, i32 4 - %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2848,7 +2848,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -2904,7 +2904,7 @@ define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2972,7 +2972,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -3046,7 +3046,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3132,7 +3132,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -3193,7 +3193,7 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3266,7 +3266,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -3345,7 +3345,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 % entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3436,7 +3436,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i32, ptr 
%out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -3492,7 +3492,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3560,7 +3560,7 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -3634,7 +3634,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3720,7 +3720,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -3781,7 +3781,7 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3854,7 +3854,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -3933,7 +3933,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4024,7 +4024,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -4080,7 +4080,7 @@ define amdgpu_kernel void @atomic_umin_i32(ptr %out, 
i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4148,7 +4148,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -4222,7 +4222,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4308,7 +4308,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -4373,7 +4373,7 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4447,7 +4447,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -4530,7 +4530,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4622,7 +4622,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -4682,7 +4682,7 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4751,7 +4751,7 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; 
GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -4829,7 +4829,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4916,7 +4916,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -5653,7 +5653,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6297,7 +6297,7 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6371,7 +6371,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -6454,7 +6454,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6546,7 +6546,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -6606,7 +6606,7 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6675,7 +6675,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw 
volatile xor ptr %out, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -6753,7 +6753,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6840,7 +6840,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i64 %index - %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -9126,7 +9126,7 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9190,7 +9190,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 1023 - %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9258,7 +9258,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 1024 - %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9332,7 +9332,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr %out, i32 4 - %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -9415,7 +9415,7 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9507,7 +9507,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i32, ptr %out, i64 %index %gep = getelementptr i32, ptr %ptr, i32 4 - %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst + %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void } @@ -9567,7 +9567,7 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = 
atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -9636,7 +9636,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
 ; GFX11-NEXT: s_endpgm
 entry:
-  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   store i32 %val, ptr %out2
   ret void
 }
@@ -9714,7 +9714,7 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
 ; GFX11-NEXT: s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -9801,7 +9801,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-NEXT: s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   store i32 %val, ptr %out2
   ret void
 }
@@ -9866,7 +9866,7 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
 ; GFX11-NEXT: s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -9930,7 +9930,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
 ; GFX11-NEXT: s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 1023
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -9998,7 +9998,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
 ; GFX11-NEXT: s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 1024
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -10072,7 +10072,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GFX11-NEXT: s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   store i32 %val, ptr %out2
   ret void
 }
@@ -10155,7 +10155,7 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -10247,7 +10247,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   store i32 %val, ptr %out2
   ret void
 }
@@ -10307,7 +10307,7 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
 entry:
-  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -10376,7 +10376,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
 ; GFX11-NEXT: s_endpgm
 entry:
-  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   store i32 %val, ptr %out2
   ret void
 }
@@ -10454,7 +10454,7 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
 ; GFX11-NEXT: s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -10541,7 +10541,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
 ; GFX11-NEXT: s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   store i32 %val, ptr %out2
   ret void
 }
@@ -10851,3 +10851,5 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
   store bfloat %val, ptr %out
   ret void
 }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 1311560715ddd..e74ad3d62bea4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -1061,25 +1061,64 @@ define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_noret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_sub v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB30_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_noret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_sub v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB30_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_noret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB30_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
   ret void
@@ -1091,9 +1130,22 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_sub v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB31_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset:
@@ -1101,17 +1153,43 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_sub v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB31_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB31_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1122,25 +1200,67 @@ define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_ret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB32_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB32_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_ret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB32_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB32_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_ret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB32_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB32_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1150,29 +1270,69 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB33_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v1, v2
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB33_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB33_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v1, v2
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB33_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB33_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB33_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1185,10 +1345,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_sub v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB34_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar:
@@ -1196,10 +1368,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_sub v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB34_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar:
@@ -1207,10 +1391,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_sub v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB34_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
   ret void
@@ -1224,10 +1420,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_sub v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB35_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
@@ -1237,10 +1445,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_sub v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB35_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
@@ -1248,10 +1468,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB35_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1264,10 +1496,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB36_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar:
@@ -1275,10 +1521,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB36_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar:
@@ -1286,10 +1546,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_subrev_u32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB36_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1301,12 +1575,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: s_add_u32 s34, s4, 16
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB37_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
@@ -1314,12 +1600,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: s_add_u32 s34, s4, 16
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB37_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
@@ -1327,10 +1625,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_subrev_u32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB37_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1411,25 +1723,64 @@ define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_noret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_and v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB40_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_noret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_and v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB40_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_noret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB40_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
   ret void
@@ -1441,9 +1792,22 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_and v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB41_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_noret_offset:
@@ -1451,17 +1815,43 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_and v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB41_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_noret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB41_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1472,25 +1862,67 @@ define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_ret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB42_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_ret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB42_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_ret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB42_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1500,29 +1932,69 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_ret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_and_b32_e32 v0, v1, v2
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB43_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_ret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_and_b32_e32 v0, v1, v2
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB43_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_ret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB43_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1535,10 +2007,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_and v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB44_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_noret_scalar:
@@ -1546,10 +2030,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_and v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB44_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_noret_scalar:
@@ -1557,10 +2053,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_and v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB44_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
   ret void
@@ -1574,10 +2082,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_and v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB45_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar:
@@ -1587,10 +2107,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_and v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB45_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar:
@@ -1598,10 +2130,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB45_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1614,10 +2158,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB46_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_ret_scalar:
@@ -1625,10 +2183,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB46_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_ret_scalar:
@@ -1636,10 +2208,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB46_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1651,12 +2237,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: s_add_u32 s34, s4, 16
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB47_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar:
@@ -1664,12 +2262,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: s_add_u32 s34, s4, 16
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB47_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar:
@@ -1677,10 +2287,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB47_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -2532,25 +3156,64 @@ define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_noret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_or v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB60_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB60_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_noret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_or v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB60_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB60_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_noret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB60_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB60_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
   ret void
@@ -2562,9 +3225,22 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_or v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB61_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_noret_offset:
@@ -2572,17 +3248,43 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_or v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB61_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_noret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB61_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2593,25 +3295,67 @@ define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_ret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB62_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB62_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_ret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB62_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB62_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_ret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB62_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB62_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -2621,29 +3365,69 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_ret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB63_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_or_b32_e32 v0, v1, v2
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB63_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_ret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB63_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_or_b32_e32 v0, v1, v2
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB63_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_ret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB63_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB63_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2656,10 +3440,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_or v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB64_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_noret_scalar:
@@ -2667,10 +3463,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_or v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB64_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_noret_scalar:
@@ -2678,10 +3486,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_or v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB64_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
   ret void
@@ -2695,10 +3515,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_or v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB65_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar:
@@ -2708,10 +3540,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_or v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB65_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar:
@@ -2719,10 +3563,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, 
v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB65_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2735,10 +3591,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB66_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_scalar: @@ -2746,10 +3616,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB66_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_scalar: @@ -2757,10 +3641,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB66_1 +; 
GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2772,12 +3670,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB67_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -2785,12 +3695,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB67_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -2798,10 +3720,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: 
s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB67_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2882,25 +3818,64 @@ define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_xor_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB70_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB70_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB70_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst ret void @@ -2912,9 +3887,22 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB71_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: @@ -2922,17 +3910,43 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB71_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB71_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -2943,25 +3957,67 @@ define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB72_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB72_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB72_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2971,29 +4027,69 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB73_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN2-NEXT: 
flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB73_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB73_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -3006,10 +4102,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB74_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar: @@ -3017,10 +4125,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB74_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: 
flat_atomic_xor_i32_noret_scalar: @@ -3028,10 +4148,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB74_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst ret void @@ -3045,10 +4177,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB75_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -3058,10 +4202,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB75_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -3069,10 +4225,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: 
flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB75_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -3085,10 +4253,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB76_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar: @@ -3096,10 +4278,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB76_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar: @@ -3107,10 +4303,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; 
GCN3-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB76_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i32 %in seq_cst ret i32 %result @@ -3122,12 +4332,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB77_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -3135,12 +4357,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB77_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -3148,10 +4382,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 
v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB77_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -4228,22 +5476,9 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 -; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: @@ -4251,43 +5486,17 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 -; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 -; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -4298,69 +5507,29 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v0, v[3:4] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_max_i32_e32 v0, v1, v2 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_smax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v0, v[3:4] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_max_i32_e32 v0, v1, v2 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_smax v0, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 -; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 
v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5281,22 +6450,9 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: @@ -5304,43 +6460,17 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5351,69 +6481,29 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v0, v[3:4] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_max_u32_e32 v0, v1, v2 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_umax v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v0, v[3:4] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_max_u32_e32 v0, v1, v2 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_umax v0, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6022,22 +7112,9 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_u32_e32 v3, v4, v2
-; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN1-NEXT: flat_atomic_umin v[0:1], v2
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v4, v3
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB115_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
@@ -6045,43 +7122,17 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v4, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_u32_e32 v3, v4, v2
-; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN2-NEXT: flat_atomic_umin v[0:1], v2
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v4, v3
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB115_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB115_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i64 4
 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6092,69 +7143,29 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i
 ; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v0, v[3:4]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_min_u32_e32 v0, v1, v2
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_umin v0, v[0:1], v2 glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB116_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v0, v[3:4]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_min_u32_e32 v0, v1, v2
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_umin v0, v[0:1], v2 glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB116_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB116_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i64 4
 %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7152,22 +8163,9 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v4, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_i32_e32 v3, v4, v2
-; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN1-NEXT: flat_atomic_smin v[0:1], v2
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v4, v3
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB129_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
@@ -7175,43 +8173,17 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v4, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_i32_e32 v3, v4, v2
-; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN2-NEXT: flat_atomic_smin v[0:1], v2
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v4, v3
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB129_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB129_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i64 4
 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7222,69 +8194,29 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
 ; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v0, v[3:4]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_min_i32_e32 v0, v1, v2
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_smin v0, v[0:1], v2 glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB130_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v0, v[3:4]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_min_i32_e32 v0, v1, v2
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_smin v0, v[0:1], v2 glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB130_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB130_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i64 4
 %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7299,25 +8231,70 @@ define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB131_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB131_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB131_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB131_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_inc v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB131_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_u32_e32 v3, 1, v4
+; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB131_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
 ret void
@@ -7329,9 +8306,24 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB132_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB132_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
@@ -7339,17 +8331,47 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB132_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB132_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB132_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_u32_e32 v3, 1, v4
+; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB132_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i32 4
 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -7360,25 +8382,73 @@ define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB133_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB133_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB133_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB133_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB133_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_add_u32_e32 v3, 1, v4
+; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB133_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
 ret i32 %result
@@ -7388,29 +8458,75 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB134_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1
+; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
+; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB134_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB134_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1
+; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
+; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB134_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB134_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_add_u32_e32 v3, 1, v4
+; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB134_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i32 4
 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -7423,10 +8539,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB135_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3
+; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB135_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
@@ -7434,10 +8564,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB135_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB135_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
@@ -7445,10 +8589,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_inc v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB135_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_u32_e32 v2, 1, v3
+; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB135_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
 ret void
@@ -7462,10 +8620,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB136_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3
+; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB136_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
@@ -7475,10 +8647,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB136_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB136_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
@@ -7486,10 +8672,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB136_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_u32_e32 v2, 1, v3
+; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB136_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i32 4
 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -7502,10 +8702,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB137_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v4
+; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB137_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
@@ -7513,10 +8729,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB137_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v4
+; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB137_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
@@ -7524,10 +8756,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB137_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_add_u32_e32 v0, 1, v4
+; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB137_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
 ret i32 %result
@@ -7538,13 +8786,27 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: s_add_u32 s34, s4, 16
-; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB138_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v4
+; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB138_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
@@ -7552,12 +8814,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: s_add_u32 s34, s4, 16
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB138_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v4
+; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB138_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
@@ -7565,10 +8841,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB138_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_add_u32_e32 v0, 1, v4
+; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB138_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i32 4
 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -7649,25 +8941,76 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB141_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB141_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB141_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB141_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_dec v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB141_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB141_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
 ret void
@@ -7679,9 +9022,26 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB142_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB142_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
@@ -7689,17 +9049,51 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB142_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB142_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB142_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB142_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i32 4
 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -7710,25 +9104,79 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB143_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB143_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB143_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB143_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB143_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB143_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
 ret i32 %result
@@ -7738,29 +9186,81 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB144_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB144_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB144_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB144_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB144_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB144_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i32 4
 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -7773,10 +9273,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s6
+; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB145_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
@@ -7784,10 +9301,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s6
+; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB145_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
@@ -7795,10 +9329,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_dec v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s6
+; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN3-NEXT: v_add_u32_e32 v2, -1, v3
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB145_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
 ret void
@@ -7812,10 +9363,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s6
+; GCN1-NEXT: .LBB146_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB146_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
@@ -7825,10 +9393,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s6
+; GCN2-NEXT: .LBB146_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB146_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
@@ -7836,10 +9421,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s6
+; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN3-NEXT: v_add_u32_e32 v2, -1, v3
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB146_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i32 4
 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -7852,10 +9454,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s6
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB147_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v5, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB147_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
@@ -7863,10 +9484,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s6
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB147_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v5, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB147_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
@@ -7874,10 +9514,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v3, s6
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB147_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v5, v0
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN3-NEXT: v_add_u32_e32 v0, -1, v5
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB147_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
 ret i32 %result
@@ -7889,12 +9548,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: s_add_u32 s34, s4, 16
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s6
+; GCN1-NEXT: .LBB148_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v5, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB148_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
@@ -7902,12 +9578,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: s_add_u32 s34, s4, 16
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s6
+; GCN2-NEXT: .LBB148_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v5, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB148_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
@@ -7915,10 +9608,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v3, s6
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB148_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v5, v0
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN3-NEXT: v_add_u32_e32 v0, -1, v5
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] offset:16 glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB148_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr %out, i32 4
 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index ffe0596a95e33..d9a596283db1e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -1299,7 +1299,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT: s_endpgm
 entry:
 %gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
 ret void
 }
@@ -1442,7 +1442,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT: s_endpgm
 entry:
 %gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
 store i64 %tmp0, ptr %out2
 ret void
 }
@@ -1594,7 +1594,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 entry:
 %ptr = getelementptr i64, ptr %out, i64 %index
 %gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
 ret void
 }
@@ -1743,7 +1743,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
 %ptr = getelementptr i64, ptr %out, i64 %index
 %gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
 store i64 %tmp0, ptr %out2
 ret void
 }
@@ -1875,7 +1875,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
 ; GFX12-NEXT: s_endpgm
 entry:
- %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
 ret void
 }
@@ -2012,7 +2012,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT: s_endpgm
 entry:
- %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
 store i64 %tmp0, ptr %out2
 ret void
 }
@@ -2157,7 +2157,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT: s_endpgm
 entry:
 %ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
 ret void
 }
@@ -2300,7 +2300,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT: s_endpgm
 entry:
 %ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
 store i64 %tmp0, ptr %out2
 ret void
 }
@@ -2441,7 +2441,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT: s_endpgm
 entry:
 %gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst,
!amdgpu.no.fine.grained.memory !0 ret void } @@ -2587,7 +2587,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -2742,7 +2742,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2894,7 +2894,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -3029,7 +3029,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3169,7 +3169,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -3317,7 +3317,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3463,7 +3463,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -3604,7 +3604,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3752,7 +3752,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -3907,7 +3907,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4061,7 +4061,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -4196,7 +4196,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4338,7 +4338,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -4486,7 +4486,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4634,7 +4634,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -4775,7 +4775,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4923,7 +4923,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -5078,7 +5078,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in 
syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -5232,7 +5232,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -5367,7 +5367,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -5509,7 +5509,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -5657,7 +5657,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -5805,7 +5805,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -5946,7 +5946,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6094,7 +6094,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -6249,7 +6249,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6403,7 +6403,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index 
%gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -6538,7 +6538,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6680,7 +6680,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -6828,7 +6828,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6976,7 +6976,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -7117,7 +7117,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7265,7 +7265,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -7420,7 +7420,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7574,7 +7574,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -7709,7 +7709,7 @@ define amdgpu_kernel 
void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7851,7 +7851,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -7999,7 +7999,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8147,7 +8147,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -8285,7 +8285,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8428,7 +8428,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -8580,7 +8580,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8729,7 +8729,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -8861,7 +8861,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8998,7 +8998,7 @@ define amdgpu_kernel 
void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -9143,7 +9143,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9286,7 +9286,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -10759,7 +10759,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10902,7 +10902,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -11054,7 +11054,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11203,7 +11203,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -11335,7 +11335,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11472,7 +11472,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -11617,7 +11617,7 @@ define 
amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11760,7 +11760,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -14107,7 +14107,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14260,7 +14260,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -14422,7 +14422,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14581,7 +14581,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -14723,7 +14723,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14870,7 +14870,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -15025,7 +15025,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") 
seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15178,7 +15178,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -15335,7 +15335,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15499,7 +15499,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -15670,7 +15670,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15840,7 +15840,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -15991,7 +15991,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -16149,7 +16149,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } @@ -16313,7 +16313,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -16477,7 +16477,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, 
i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index 3c1bc95cc38f6..757649ca592b3 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -458,13 +458,25 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_offset: @@ -473,13 +485,25 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_offset: @@ -501,40 +525,66 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_and_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: 
flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, s5, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_and_b32_e32 v3, s5, v5 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_offset: @@ -561,40 +611,64 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_and_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: 
v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_addr64_offset: @@ -624,42 +698,68 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; 
GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, s5, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_and_b32_e32 v3, s5, v5 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: @@ -689,27 +789,55 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_and_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64: @@ -732,14 +860,29 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s5, v7 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -749,14 +892,29 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v5, s5, v7 +; GFX8-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -785,36 +943,60 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_and_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_addr64: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_addr64: @@ -843,38 +1025,64 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, s5, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_and_b32_e32 v3, s5, v5 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: @@ -906,13 +1114,26 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_offset: @@ -921,13 +1142,26 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_offset: @@ -949,40 +1183,68 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_sub_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: 
.LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_offset: @@ -1009,40 +1271,66 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_sub_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; 
GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_addr64_offset: @@ -1072,42 +1360,70 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v8, v3 
+; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB19_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: @@ -1137,27 +1453,57 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_sub_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64: @@ -1180,14 +1526,30 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v8, v1 +; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB21_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1197,14 +1559,30 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] 
glc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1233,36 +1611,62 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_sub_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_addr64: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_addr64: @@ -1291,38 +1695,66 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], 
v[5:8] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: @@ -1354,12 +1786,27 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_offset: @@ -1368,12 +1815,27 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_offset: @@ -1395,40 +1857,70 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: 
atomic_max_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: 
s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_offset: @@ -1455,38 +1947,68 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_addr64_offset: @@ -1516,42 +2038,72 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: @@ -1581,25 +2133,59 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_max_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64: @@ -1622,16 +2208,33 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, 
s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -1639,16 +2242,33 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -1675,34 +2295,64 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], 
s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_addr64: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_addr64: @@ -1731,38 +2381,68 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: @@ -1794,12 +2474,27 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: 
v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_offset: @@ -1808,12 +2503,27 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_offset: @@ -1835,40 +2545,70 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_umax_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; 
GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_offset: @@ -1895,38 +2635,68 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_addr64_offset: @@ -1956,42 +2726,72 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -2021,25 +2821,59 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_umax_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64: @@ -2062,16 +2896,33 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -2079,16 +2930,33 @@ define 
amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2115,34 +2983,64 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_endpgm -; -; GFX8-LABEL: atomic_umax_i64_addr64: -; GFX8: ; %bb.0: ; %entry +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_addr64: @@ -2171,38 +3069,68 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: @@ -2234,12 +3162,27 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_offset: @@ -2248,12 +3191,27 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] -; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_offset: @@ -2275,40 +3233,70 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_min_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt 
vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_offset: @@ -2335,38 +3323,68 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; 
GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_addr64_offset: @@ -2396,42 +3414,72 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 
3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: @@ -2461,25 +3509,59 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_min_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; 
GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64: @@ -2502,16 +3584,33 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -2519,16 +3618,33 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2555,34 +3671,64 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_addr64: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_addr64: @@ -2611,38 +3757,68 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], 
v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: @@ -2674,12 +3850,27 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_offset: @@ -2688,12 +3879,27 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_offset: @@ -2715,40 +3921,70 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_umin_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_offset: @@ -2775,38 +4011,68 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr 
%out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umin_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_addr64_offset: @@ -2836,42 +4102,72 @@ define amdgpu_kernel void 
@atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: @@ -2901,25 +4197,59 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_umin_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64: @@ -2942,16 +4272,33 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc -; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -2959,16 +4306,33 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2995,34 +4359,64 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umin_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: 
v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_addr64: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_addr64: @@ -3051,38 +4445,68 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 
+; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: @@ -3114,13 +4538,25 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 +; GFX7-NEXT: 
v_or_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_offset: @@ -3129,13 +4565,25 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_offset: @@ -3157,40 +4605,66 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_or_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_or_b32_e32 v3, s5, v5 +; GFX7-NEXT: v_or_b32_e32 v2, s4, v4 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s5, v5 +; GFX8-NEXT: v_or_b32_e32 v2, s4, v4 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_offset: @@ -3217,40 +4691,64 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_or_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 
v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_addr64_offset: @@ -3280,42 +4778,68 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_or_b32_e32 v3, s5, v5 +; GFX7-NEXT: v_or_b32_e32 v2, s4, v4 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: 
s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s5, v5 +; GFX8-NEXT: v_or_b32_e32 v2, s4, v4 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: @@ -3345,27 +4869,55 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_or_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64: @@ -3388,14 +4940,29 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_or_b32_e32 v5, s5, v7 +; GFX7-NEXT: v_or_b32_e32 v4, s4, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3405,14 +4972,29 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_or_b32_e32 v5, s5, v7 +; GFX8-NEXT: v_or_b32_e32 v4, s4, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3441,36 +5023,60 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_or_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; 
GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB62_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_addr64: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB62_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_addr64: @@ -3499,38 +5105,64 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_or_b32_e32 v3, s5, 
v5 +; GFX7-NEXT: v_or_b32_e32 v2, s4, v4 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB63_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s5, v5 +; GFX8-NEXT: v_or_b32_e32 v2, s4, v4 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB63_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: @@ -4104,13 +5736,25 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB74_1 +; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_offset: @@ -4119,13 +5763,25 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB74_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_offset: @@ -4147,40 +5803,66 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_xor_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5 +; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB75_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: 
flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB75_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_offset: @@ -4207,40 +5889,64 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xor_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB76_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; 
GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB76_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_addr64_offset: @@ -4269,43 +5975,69 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s4 -; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5 +; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB77_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 
v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB77_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: @@ -4335,27 +6067,55 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_xor_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB78_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB78_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64: @@ -4378,14 +6138,29 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; 
GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_xor_b32_e32 v5, s5, v7 +; GFX7-NEXT: v_xor_b32_e32 v4, s4, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB79_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4395,14 +6170,29 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_xor_b32_e32 v5, s5, v7 +; GFX8-NEXT: v_xor_b32_e32 v4, s4, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB79_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4431,36 +6221,60 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xor_i64_addr64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 -; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB80_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_xor_i64_addr64:
 ; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB80_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_addr64:
@@ -4489,38 +6303,64 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB81_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_xor_i64_ret_addr64:
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB81_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64:
@@ -5920,13 +7760,28 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_add_u32 s0, s0, 32
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB107_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_inc_i64_offset:
@@ -5935,13 +7790,28 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_add_u32 s0, s0, 32
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB107_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_offset:
@@ -5963,40 +7833,72 @@ entry:
 define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GFX7-LABEL: atomic_inc_i64_ret_offset:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
 ; GFX7-NEXT: s_add_u32 s0, s0, 32
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB108_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_inc_i64_ret_offset:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
 ; GFX8-NEXT: s_add_u32 s0, s0, 32
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB108_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_ret_offset:
@@ -6023,40 +7925,70 @@ entry:
 define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GFX7-LABEL: atomic_inc_i64_incr64_offset:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
 ; GFX7-NEXT: s_add_u32 s0, s0, 32
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB109_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_inc_i64_incr64_offset:
 ; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
 ; GFX8-NEXT: s_add_u32 s0, s0, 32
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB109_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_incr64_offset:
@@ -6086,42 +8018,74 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
 ; GFX7-NEXT: s_add_u32 s0, s0, 32
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB110_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset:
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
 ; GFX8-NEXT: s_add_u32 s0, s0, 32
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB110_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
@@ -6151,27 +8115,61 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GFX7-LABEL: atomic_inc_i64:
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB111_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_inc_i64:
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB111_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64:
@@ -6194,34 +8192,66 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB112_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_inc_i64_ret:
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB112_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_ret:
@@ -6247,36 +8277,66 @@ entry:
 define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) {
 ; GFX7-LABEL: atomic_inc_i64_incr64:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB113_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_inc_i64_incr64:
 ; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB113_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_incr64:
@@ -6305,38 +8365,70 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB114_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_inc_i64_ret_incr64:
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB114_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_ret_incr64:
@@ -6364,32 +8456,70 @@ entry:
 define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GFX7-LABEL: atomic_dec_i64_offset:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_add_u32 s0, s0, 32
-; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s0, s4, 32
+; GFX7-NEXT: s_addc_u32 s1, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB115_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_dec_i64_offset:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB115_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_offset:
@@ -6411,40 +8541,80 @@ entry:
 define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GFX7-LABEL: atomic_dec_i64_ret_offset:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: s_add_u32 s0, s0, 32
-; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_add_u32 s0, s8, 32
+; GFX7-NEXT: s_addc_u32 s1, s9, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB116_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_dec_i64_ret_offset:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_add_u32 s0, s8, 32
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB116_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_ret_offset:
@@ -6471,40 +8641,78 @@ entry:
 define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GFX7-LABEL: atomic_dec_i64_decr64_offset:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX7-NEXT: s_add_u32 s0, s4, s0
+; GFX7-NEXT: s_addc_u32 s1, s5, s1
 ; GFX7-NEXT: s_add_u32 s0, s0, 32
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB117_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_dec_i64_decr64_offset:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX8-NEXT: s_add_u32 s0, s4, s0
+; GFX8-NEXT: s_addc_u32 s1, s5, s1
 ; GFX8-NEXT: s_add_u32 s0, s0, 32
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB117_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_decr64_offset:
@@ -6532,44 +8740,84 @@ entry:
 define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GFX7-LABEL: atomic_dec_i64_ret_decr64_offset:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX7-NEXT: s_add_u32 s0, s4, s0
+; GFX7-NEXT: s_addc_u32 s1, s5, s1
 ; GFX7-NEXT: s_add_u32 s0, s0, 32
 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
+; GFX7-NEXT: v_mov_b32_e32 v5, s8
+; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB118_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX8-NEXT: s_add_u32 s0, s4, s0
+; GFX8-NEXT: s_addc_u32 s1, s5, s1
 ; GFX8-NEXT: s_add_u32 s0, s0, 32
 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: v_mov_b32_e32 v5, s8
+; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB118_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
@@ -6598,28 +8846,70 @@ entry:
 define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GFX7-LABEL: atomic_dec_i64:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB119_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_dec_i64:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB119_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64:
@@ -6640,36 +8930,76 @@ entry:
 define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX7-LABEL: atomic_dec_i64_ret:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v1, s9
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB120_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_dec_i64_ret:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB120_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_ret:
@@ -6695,36 +9025,74 @@ entry:
 define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) {
 ; GFX7-LABEL: atomic_dec_i64_decr64:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX7-NEXT: s_add_u32 s0, s4, s0
+; GFX7-NEXT: s_addc_u32 s1, s5, s1
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB121_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_dec_i64_decr64:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX8-NEXT: s_add_u32 s0, s4, s0
+; GFX8-NEXT: s_addc_u32 s1, s5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB121_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_decr64:
@@ -6751,40 +9119,80 @@ entry:
 define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GFX7-LABEL: atomic_dec_i64_ret_decr64:
 ; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX7-NEXT: s_add_u32 s0, s4, s0
+; GFX7-NEXT: s_addc_u32 s1, s5, s1
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
+; GFX7-NEXT: v_mov_b32_e32 v5, s8
+; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB122_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: atomic_dec_i64_ret_decr64:
 ; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX8-NEXT: s_add_u32 s0, s4, s0
+; GFX8-NEXT: s_addc_u32 s1, s5, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: v_mov_b32_e32 v5, s8
+; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB122_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_ret_decr64:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 23dfe2f70fa7e..524100c5b7a25 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -3633,21 +3633,40 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN1-NEXT: s_cbranch_execnz .LBB30_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB30_4
+; GCN1-NEXT: s_cbranch_execnz .LBB30_6
 ; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi
 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB30_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB30_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: ; implicit-def: $vgpr3
 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GCN1-NEXT: s_cbranch_execz .LBB30_2
-; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB30_6: ; %atomicrmw.private
 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3673,21 +3692,40 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN2-NEXT: s_cbranch_execnz .LBB30_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB30_4
+; GCN2-NEXT: s_cbranch_execnz .LBB30_6
 ; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB30_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB30_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: ; implicit-def: $vgpr3
 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GCN2-NEXT: s_cbranch_execz .LBB30_2
-; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB30_6: ; %atomicrmw.private
 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3711,21 +3749,37 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN3-NEXT: s_cbranch_execnz .LBB30_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB30_4
+; GCN3-NEXT: s_cbranch_execnz .LBB30_6
 ; GCN3-NEXT: .LBB30_2: ; %atomicrmw.phi
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 ; GCN3-NEXT: .LBB30_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB30_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB30_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; GCN3-NEXT: ; implicit-def: $vgpr2
+; GCN3-NEXT: ; implicit-def: $vgpr3
 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GCN3-NEXT: s_cbranch_execz .LBB30_2
-; GCN3-NEXT: .LBB30_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB30_6: ; %atomicrmw.private
 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3756,21 +3810,40 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN1-NEXT: s_cbranch_execnz .LBB31_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT:
s_cbranch_execnz .LBB31_4 +; GCN1-NEXT: s_cbranch_execnz .LBB31_6 ; GCN1-NEXT: .LBB31_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB31_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB31_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB31_2 -; GCN1-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB31_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -3798,21 +3871,40 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB31_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB31_4 +; GCN2-NEXT: s_cbranch_execnz .LBB31_6 ; GCN2-NEXT: .LBB31_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB31_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB31_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB31_2 -; GCN2-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB31_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: buffer_load_dword v1, v0, 
s[0:3], 0 offen @@ -3838,21 +3930,37 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB31_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB31_4 +; GCN3-NEXT: s_cbranch_execnz .LBB31_6 ; GCN3-NEXT: .LBB31_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB31_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB31_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB31_2 -; GCN3-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB31_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -3877,41 +3985,56 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB32_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB32_4 -; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB32_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB32_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; 
implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB32_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: .LBB32_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB32_2 -; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB32_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB32_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -3920,41 +4043,56 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB32_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB32_4 -; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB32_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB32_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; 
implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB32_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: .LBB32_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB32_2 -; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB32_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB32_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -3969,21 +4107,37 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB32_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB32_4 +; GCN3-NEXT: s_cbranch_execnz .LBB32_6 ; GCN3-NEXT: .LBB32_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB32_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB32_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB32_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB32_2 -; GCN3-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB32_6: ; %atomicrmw.private ; GCN3-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4015,21 +4169,40 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB33_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB33_4 +; GCN1-NEXT: s_cbranch_execnz .LBB33_6 ; GCN1-NEXT: .LBB33_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB33_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 +; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB33_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB33_2 -; GCN1-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB33_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4058,21 +4231,40 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB33_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB33_4 +; GCN2-NEXT: s_cbranch_execnz .LBB33_6 ; GCN2-NEXT: .LBB33_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB33_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB33_4 +; 
GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB33_2 -; GCN2-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB33_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4099,21 +4291,37 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB33_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB33_4 +; GCN3-NEXT: s_cbranch_execnz .LBB33_6 ; GCN3-NEXT: .LBB33_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB33_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB33_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB33_2 -; GCN3-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB33_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4144,21 +4352,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB34_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB34_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB34_6 ; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB34_2 -; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB34_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB34_2 +; GCN1-NEXT: .LBB34_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec @@ -4188,21 +4415,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB34_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB34_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB34_6 ; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB34_2 -; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB34_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB34_2 +; GCN2-NEXT: .LBB34_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -4229,21 +4475,35 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB34_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB34_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB34_6 ; GCN3-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; 
GCN3-NEXT: .LBB34_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB34_2 -; GCN3-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB34_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB34_2 +; GCN3-NEXT: .LBB34_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -4276,21 +4536,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB35_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB35_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB35_6 ; GCN1-NEXT: .LBB35_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB35_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: .LBB35_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB35_2 -; GCN1-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB35_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB35_2 +; GCN1-NEXT: .LBB35_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec @@ -4322,21 +4601,40 @@ 
define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB35_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB35_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB35_6 ; GCN2-NEXT: .LBB35_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB35_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: .LBB35_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB35_2 -; GCN2-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB35_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB35_2 +; GCN2-NEXT: .LBB35_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -4365,21 +4663,35 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB35_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB35_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB35_6 ; GCN3-NEXT: .LBB35_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB35_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: .LBB35_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB35_2 -; GCN3-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; 
GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB35_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB35_2 +; GCN3-NEXT: .LBB35_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -4409,20 +4721,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB36_2 +; GCN1-NEXT: s_cbranch_vccz .LBB36_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v8, v1 +; GCN1-NEXT: v_mov_b32_e32 v7, v0 +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 +; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB36_3 -; GCN1-NEXT: s_branch .LBB36_4 -; GCN1-NEXT: .LBB36_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB36_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB36_6 +; GCN1-NEXT: .LBB36_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB36_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec @@ -4438,7 +4769,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB36_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -4451,20 +4782,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB36_2 +; GCN2-NEXT: s_cbranch_vccz .LBB36_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, 
s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v8, v1 +; GCN2-NEXT: v_mov_b32_e32 v7, v0 +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB36_3 -; GCN2-NEXT: s_branch .LBB36_4 -; GCN2-NEXT: .LBB36_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB36_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB36_6 +; GCN2-NEXT: .LBB36_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB36_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -4479,7 +4829,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB36_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -4490,20 +4840,34 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB36_2 +; GCN3-NEXT: s_cbranch_vccz .LBB36_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 +; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB36_3 -; GCN3-NEXT: s_branch .LBB36_4 -; GCN3-NEXT: .LBB36_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB36_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB36_6 +; GCN3-NEXT: .LBB36_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN3-NEXT: 
s_cbranch_execz .LBB36_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -4516,7 +4880,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB36_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst @@ -4535,20 +4899,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB37_2 +; GCN1-NEXT: s_cbranch_vccz .LBB37_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v8, v1 +; GCN1-NEXT: v_mov_b32_e32 v7, v0 +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 +; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB37_3 -; GCN1-NEXT: s_branch .LBB37_4 -; GCN1-NEXT: .LBB37_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB37_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB37_6 +; GCN1-NEXT: .LBB37_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB37_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec @@ -4564,7 +4947,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB37_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -4579,20 +4962,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB37_2 +; GCN2-NEXT: s_cbranch_vccz .LBB37_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 
v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v8, v1 +; GCN2-NEXT: v_mov_b32_e32 v7, v0 +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB37_3 -; GCN2-NEXT: s_branch .LBB37_4 -; GCN2-NEXT: .LBB37_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB37_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB37_6 +; GCN2-NEXT: .LBB37_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB37_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -4607,7 +5009,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB37_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -4620,20 +5022,34 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB37_2 +; GCN3-NEXT: s_cbranch_vccz .LBB37_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 +; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB37_3 -; GCN3-NEXT: s_branch .LBB37_4 -; GCN3-NEXT: .LBB37_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB37_2 +; GCN3-NEXT: ; %bb.3: ; 
%Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB37_6 +; GCN3-NEXT: .LBB37_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB37_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -4646,7 +5062,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB37_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -4928,21 +5344,40 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB40_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB40_4 +; GCN1-NEXT: s_cbranch_execnz .LBB40_6 ; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB40_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB40_2 -; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB40_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -4968,21 +5403,40 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB40_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB40_4 +; GCN2-NEXT: s_cbranch_execnz .LBB40_6 ; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, 
v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB40_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB40_2 -; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB40_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -5006,21 +5460,37 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB40_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB40_4 +; GCN3-NEXT: s_cbranch_execnz .LBB40_6 ; GCN3-NEXT: .LBB40_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB40_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB40_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB40_2 -; GCN3-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB40_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -5051,21 +5521,40 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB41_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB41_4 +; GCN1-NEXT: s_cbranch_execnz .LBB41_6 ; GCN1-NEXT: .LBB41_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB41_3: ; %atomicrmw.global -; GCN1-NEXT: 
flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB41_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB41_2 -; GCN1-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB41_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -5093,21 +5582,40 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB41_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB41_4 +; GCN2-NEXT: s_cbranch_execnz .LBB41_6 ; GCN2-NEXT: .LBB41_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB41_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB41_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB41_2 -; GCN2-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB41_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -5133,21 +5641,37 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB41_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; 
GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB41_4 +; GCN3-NEXT: s_cbranch_execnz .LBB41_6 ; GCN3-NEXT: .LBB41_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB41_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB41_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB41_2 -; GCN3-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB41_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -5172,41 +5696,56 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB42_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB42_4 -; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB42_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB42_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB42_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: .LBB42_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB42_2 -; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB42_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB42_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -5215,41 +5754,56 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB42_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB42_4 -; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB42_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB42_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB42_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: .LBB42_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB42_2 -; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB42_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB42_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -5264,21 +5818,37 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB42_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB42_4 +; GCN3-NEXT: s_cbranch_execnz .LBB42_6 ; GCN3-NEXT: .LBB42_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB42_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB42_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB42_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB42_2 -; GCN3-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB42_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -5310,21 +5880,40 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB43_3 -; GCN1-NEXT: 
; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB43_4 +; GCN1-NEXT: s_cbranch_execnz .LBB43_6 ; GCN1-NEXT: .LBB43_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB43_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB43_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB43_2 -; GCN1-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB43_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -5353,21 +5942,40 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB43_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB43_4 +; GCN2-NEXT: s_cbranch_execnz .LBB43_6 ; GCN2-NEXT: .LBB43_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB43_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB43_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB43_2 -; GCN2-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB43_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; 
GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -5394,21 +6002,37 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB43_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB43_4 +; GCN3-NEXT: s_cbranch_execnz .LBB43_6 ; GCN3-NEXT: .LBB43_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB43_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB43_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB43_2 -; GCN3-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB43_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -5439,21 +6063,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB44_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB44_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB44_6 ; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB44_2 -; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 
s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB44_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB44_2 +; GCN1-NEXT: .LBB44_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -5482,21 +6124,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB44_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB44_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB44_6 ; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB44_2 -; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB44_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB44_2 +; GCN2-NEXT: .LBB44_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -5522,21 +6182,34 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB44_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB44_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB44_6 ; GCN3-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: 
v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB44_2 -; GCN3-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB44_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB44_2 +; GCN3-NEXT: .LBB44_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -5568,21 +6241,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB45_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB45_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB45_6 ; GCN1-NEXT: .LBB45_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB45_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB45_2 -; GCN1-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB45_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB45_2 +; GCN1-NEXT: .LBB45_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -5613,21 +6304,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB45_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB45_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB45_6 ; GCN2-NEXT: .LBB45_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB45_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 
v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB45_2 -; GCN2-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB45_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB45_2 +; GCN2-NEXT: .LBB45_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -5655,21 +6364,34 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB45_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB45_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB45_6 ; GCN3-NEXT: .LBB45_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB45_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB45_2 -; GCN3-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB45_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB45_2 +; GCN3-NEXT: .LBB45_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -5698,20 +6420,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB46_2 +; GCN1-NEXT: s_cbranch_vccz 
.LBB46_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB46_3 -; GCN1-NEXT: s_branch .LBB46_4 -; GCN1-NEXT: .LBB46_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB46_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB46_6 +; GCN1-NEXT: .LBB46_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB46_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB46_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -5726,7 +6466,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB46_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -5739,20 +6479,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB46_2 +; GCN2-NEXT: s_cbranch_vccz .LBB46_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB46_3 -; GCN2-NEXT: s_branch .LBB46_4 -; GCN2-NEXT: .LBB46_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 
vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB46_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB46_6 +; GCN2-NEXT: .LBB46_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB46_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB46_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -5766,7 +6524,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB46_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -5777,20 +6535,33 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB46_2 +; GCN3-NEXT: s_cbranch_vccz .LBB46_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB46_3 -; GCN3-NEXT: s_branch .LBB46_4 -; GCN3-NEXT: .LBB46_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB46_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB46_6 +; GCN3-NEXT: .LBB46_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB46_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB46_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -5802,7 +6573,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB46_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst @@ -5821,20 +6592,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB47_2 +; GCN1-NEXT: s_cbranch_vccz .LBB47_4 ; GCN1-NEXT: 
; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB47_3 -; GCN1-NEXT: s_branch .LBB47_4 -; GCN1-NEXT: .LBB47_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB47_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB47_6 +; GCN1-NEXT: .LBB47_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB47_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -5849,7 +6638,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB47_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -5864,20 +6653,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB47_2 +; GCN2-NEXT: s_cbranch_vccz .LBB47_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB47_3 -; GCN2-NEXT: s_branch .LBB47_4 -; GCN2-NEXT: .LBB47_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB47_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB47_6 +; GCN2-NEXT: .LBB47_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB47_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -5891,7 +6698,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB47_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -5904,20 +6711,33 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB47_2 +; GCN3-NEXT: s_cbranch_vccz .LBB47_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB47_3 -; GCN3-NEXT: s_branch .LBB47_4 -; GCN3-NEXT: .LBB47_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB47_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB47_6 +; GCN3-NEXT: .LBB47_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB47_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -5929,7 +6749,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB47_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -8126,21 +8946,40 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB60_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB60_4 +; GCN1-NEXT: s_cbranch_execnz .LBB60_6 ; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB60_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB60_2 -; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB60_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -8166,21 +9005,40 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB60_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB60_4 +; GCN2-NEXT: s_cbranch_execnz .LBB60_6 ; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB60_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB60_2 -; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB60_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -8204,21 +9062,37 @@ define void 
@flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB60_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB60_4 +; GCN3-NEXT: s_cbranch_execnz .LBB60_6 ; GCN3-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB60_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB60_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB60_2 -; GCN3-NEXT: .LBB60_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB60_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -8249,21 +9123,40 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB61_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB61_4 +; GCN1-NEXT: s_cbranch_execnz .LBB61_6 ; GCN1-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB61_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB61_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB61_2 -; GCN1-NEXT: .LBB61_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB61_6: ; 
%atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -8291,21 +9184,40 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB61_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB61_4 +; GCN2-NEXT: s_cbranch_execnz .LBB61_6 ; GCN2-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB61_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB61_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB61_2 -; GCN2-NEXT: .LBB61_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB61_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -8331,21 +9243,37 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB61_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB61_4 +; GCN3-NEXT: s_cbranch_execnz .LBB61_6 ; GCN3-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB61_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB61_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: 
$vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB61_2 -; GCN3-NEXT: .LBB61_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB61_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -8370,41 +9298,56 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB62_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB62_4 -; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB62_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB62_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB62_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: .LBB62_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB62_2 -; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB62_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN1-NEXT: v_or_b32_e32 v2, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v3, v1, v3 -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: 
v_or_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB62_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -8413,41 +9356,56 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB62_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB62_4 -; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB62_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB62_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB62_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: .LBB62_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB62_2 -; GCN2-NEXT: .LBB62_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB62_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN2-NEXT: v_or_b32_e32 v2, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v3, v1, v3 -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, 
v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB62_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -8462,21 +9420,37 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB62_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB62_4 +; GCN3-NEXT: s_cbranch_execnz .LBB62_6 ; GCN3-NEXT: .LBB62_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB62_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB62_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB62_2 -; GCN3-NEXT: .LBB62_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB62_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -8508,21 +9482,40 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB63_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB63_4 +; GCN1-NEXT: s_cbranch_execnz .LBB63_6 ; GCN1-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB63_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB63_4 +; 
GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB63_2 -; GCN1-NEXT: .LBB63_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB63_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -8551,21 +9544,40 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB63_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB63_4 +; GCN2-NEXT: s_cbranch_execnz .LBB63_6 ; GCN2-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB63_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB63_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB63_2 -; GCN2-NEXT: .LBB63_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB63_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -8592,21 +9604,37 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB63_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB63_4 +; GCN3-NEXT: s_cbranch_execnz .LBB63_6 ; GCN3-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol 
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB63_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB63_2 -; GCN3-NEXT: .LBB63_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB63_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -8637,21 +9665,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB64_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB64_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB64_6 ; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB64_2 -; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB64_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB64_2 +; GCN1-NEXT: .LBB64_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -8680,21 +9726,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB64_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB64_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB64_6 ; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 
s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB64_2 -; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB64_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB64_2 +; GCN2-NEXT: .LBB64_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -8720,21 +9784,34 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB64_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB64_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB64_6 ; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB64_2 -; GCN3-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB64_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB64_2 +; GCN3-NEXT: .LBB64_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -8766,21 +9843,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB65_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB65_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: 
s_cbranch_vccnz .LBB65_6 ; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB65_2 -; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB65_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB65_2 +; GCN1-NEXT: .LBB65_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -8811,21 +9906,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB65_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB65_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB65_6 ; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB65_2 -; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB65_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB65_2 +; 
GCN2-NEXT: .LBB65_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -8853,21 +9966,34 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB65_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB65_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB65_6 ; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB65_2 -; GCN3-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB65_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB65_2 +; GCN3-NEXT: .LBB65_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -8896,20 +10022,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB66_2 +; GCN1-NEXT: s_cbranch_vccz .LBB66_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB66_3 -; GCN1-NEXT: s_branch .LBB66_4 -; GCN1-NEXT: .LBB66_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, 
exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB66_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB66_6 +; GCN1-NEXT: .LBB66_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB66_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB66_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -8924,7 +10068,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB66_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB66_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -8937,20 +10081,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB66_2 +; GCN2-NEXT: s_cbranch_vccz .LBB66_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB66_3 -; GCN2-NEXT: s_branch .LBB66_4 -; GCN2-NEXT: .LBB66_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB66_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB66_6 +; GCN2-NEXT: .LBB66_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB66_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB66_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -8964,7 +10126,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB66_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB66_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -8975,20 +10137,33 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB66_2 
+; GCN3-NEXT: s_cbranch_vccz .LBB66_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB66_3 -; GCN3-NEXT: s_branch .LBB66_4 -; GCN3-NEXT: .LBB66_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB66_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB66_6 +; GCN3-NEXT: .LBB66_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB66_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB66_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -9000,7 +10175,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: v_or_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB66_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB66_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst @@ -9019,20 +10194,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB67_2 +; GCN1-NEXT: s_cbranch_vccz .LBB67_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB67_3 -; GCN1-NEXT: s_branch .LBB67_4 -; GCN1-NEXT: .LBB67_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; 
GCN1-NEXT: s_cbranch_execnz .LBB67_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB67_6 +; GCN1-NEXT: .LBB67_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB67_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -9047,7 +10240,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB67_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -9062,20 +10255,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB67_2 +; GCN2-NEXT: s_cbranch_vccz .LBB67_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB67_3 -; GCN2-NEXT: s_branch .LBB67_4 -; GCN2-NEXT: .LBB67_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB67_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB67_6 +; GCN2-NEXT: .LBB67_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB67_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -9089,7 +10300,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB67_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -9102,20 +10313,33 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB67_2 +; 
GCN3-NEXT: s_cbranch_vccz .LBB67_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB67_3 -; GCN3-NEXT: s_branch .LBB67_4 -; GCN3-NEXT: .LBB67_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB67_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB67_6 +; GCN3-NEXT: .LBB67_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB67_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -9127,7 +10351,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_or_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB67_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -9409,21 +10633,40 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB70_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB70_4 +; GCN1-NEXT: s_cbranch_execnz .LBB70_6 ; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB70_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; 
implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB70_2 -; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB70_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -9449,21 +10692,40 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB70_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB70_4 +; GCN2-NEXT: s_cbranch_execnz .LBB70_6 ; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB70_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB70_2 -; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB70_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -9487,21 +10749,37 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB70_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB70_4 +; GCN3-NEXT: s_cbranch_execnz .LBB70_6 ; GCN3-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB70_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; 
GCN3-NEXT: s_cbranch_execnz .LBB70_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB70_2 -; GCN3-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB70_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -9532,21 +10810,40 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB71_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB71_4 +; GCN1-NEXT: s_cbranch_execnz .LBB71_6 ; GCN1-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB71_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB71_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB71_2 -; GCN1-NEXT: .LBB71_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB71_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -9574,21 +10871,40 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB71_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB71_4 +; GCN2-NEXT: s_cbranch_execnz .LBB71_6 ; GCN2-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB71_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB71_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB71_2 -; GCN2-NEXT: .LBB71_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB71_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -9614,21 +10930,37 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB71_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB71_4 +; GCN3-NEXT: s_cbranch_execnz .LBB71_6 ; GCN3-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB71_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB71_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB71_2 -; GCN3-NEXT: .LBB71_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB71_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -9653,41 +10985,56 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB72_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB72_4 -; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; 
GCN1-NEXT: .LBB72_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB72_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB72_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB72_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: .LBB72_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB72_2 -; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB72_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3 -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB72_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -9696,41 +11043,56 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB72_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB72_4 -; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global -; GCN2-NEXT: 
flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB72_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB72_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB72_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: .LBB72_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB72_2 -; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB72_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3 -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB72_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -9745,21 +11107,37 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB72_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB72_4 +; GCN3-NEXT: s_cbranch_execnz .LBB72_6 ; GCN3-NEXT: .LBB72_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB72_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB72_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN3-NEXT: 
v_xor_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB72_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB72_2 -; GCN3-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB72_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -9791,21 +11169,40 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB73_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB73_4 +; GCN1-NEXT: s_cbranch_execnz .LBB73_6 ; GCN1-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB73_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB73_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB73_2 -; GCN1-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB73_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -9834,21 +11231,40 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB73_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB73_4 +; GCN2-NEXT: s_cbranch_execnz .LBB73_6 ; GCN2-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB73_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, 
v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB73_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB73_2 -; GCN2-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB73_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -9875,21 +11291,37 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB73_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB73_4 +; GCN3-NEXT: s_cbranch_execnz .LBB73_6 ; GCN3-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB73_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB73_2 -; GCN3-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB73_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -9920,21 +11352,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB74_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB74_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB74_6 ; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; 
GCN1-NEXT: .LBB74_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB74_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB74_2 -; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB74_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB74_2 +; GCN1-NEXT: .LBB74_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -9963,21 +11413,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB74_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB74_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB74_6 ; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB74_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB74_2 -; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB74_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB74_2 +; GCN2-NEXT: .LBB74_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 
; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -10003,21 +11471,34 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB74_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB74_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB74_6 ; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB74_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB74_2 -; GCN3-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB74_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB74_2 +; GCN3-NEXT: .LBB74_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -10049,21 +11530,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB75_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB75_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB75_6 ; GCN1-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB75_2 -; GCN1-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 
s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB75_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB75_2 +; GCN1-NEXT: .LBB75_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -10094,21 +11593,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB75_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB75_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB75_6 ; GCN2-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB75_2 -; GCN2-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB75_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB75_2 +; GCN2-NEXT: .LBB75_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -10136,21 +11653,34 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB75_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB75_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB75_6 ; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB75_2 -; GCN3-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB75_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB75_2 +; GCN3-NEXT: .LBB75_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -10179,20 +11709,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB76_2 +; GCN1-NEXT: s_cbranch_vccz .LBB76_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB76_3 -; GCN1-NEXT: s_branch .LBB76_4 -; GCN1-NEXT: .LBB76_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB76_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB76_6 +; GCN1-NEXT: .LBB76_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB76_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB76_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -10207,7 +11755,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB76_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB76_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -10220,20 +11768,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB76_2 +; GCN2-NEXT: s_cbranch_vccz .LBB76_4 ; 
GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB76_3 -; GCN2-NEXT: s_branch .LBB76_4 -; GCN2-NEXT: .LBB76_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB76_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB76_6 +; GCN2-NEXT: .LBB76_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB76_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB76_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -10247,7 +11813,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB76_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB76_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -10257,21 +11823,34 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB76_2 -; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB76_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB76_3 -; GCN3-NEXT: s_branch .LBB76_4 -; GCN3-NEXT: .LBB76_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; 
GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB76_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB76_6 +; GCN3-NEXT: .LBB76_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB76_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB76_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -10283,7 +11862,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB76_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB76_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst @@ -10302,20 +11881,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB77_2 +; GCN1-NEXT: s_cbranch_vccz .LBB77_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB77_3 -; GCN1-NEXT: s_branch .LBB77_4 -; GCN1-NEXT: .LBB77_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB77_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB77_6 +; GCN1-NEXT: .LBB77_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB77_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -10330,7 +11927,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB77_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -10345,20 +11942,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: 
s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB77_2 +; GCN2-NEXT: s_cbranch_vccz .LBB77_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB77_3 -; GCN2-NEXT: s_branch .LBB77_4 -; GCN2-NEXT: .LBB77_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB77_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB77_6 +; GCN2-NEXT: .LBB77_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB77_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -10372,7 +11987,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB77_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -10385,20 +12000,33 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB77_2 +; GCN3-NEXT: s_cbranch_vccz .LBB77_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB77_3 -; GCN3-NEXT: s_branch .LBB77_4 -; GCN3-NEXT: .LBB77_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 
s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB77_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB77_6 +; GCN3-NEXT: .LBB77_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB77_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -10410,7 +12038,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB77_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -13030,40 +14658,21 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB92_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB92_6 +; GCN1-NEXT: s_cbranch_execnz .LBB92_4 ; GCN1-NEXT: .LBB92_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB92_3: ; %atomicrmw.global -; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: flat_load_dword v6, v[0:1] -; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB92_4: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN1-NEXT: s_cbranch_execnz .LBB92_4 -; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB92_2 -; GCN1-NEXT: .LBB92_6: ; %atomicrmw.private +; GCN1-NEXT: .LBB92_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -13091,40 +14700,21 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB92_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB92_6 +; GCN2-NEXT: s_cbranch_execnz .LBB92_4 ; GCN2-NEXT: .LBB92_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB92_3: ; %atomicrmw.global -; 
GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: flat_load_dword v6, v[0:1] -; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB92_4: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN2-NEXT: s_cbranch_execnz .LBB92_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB92_2 -; GCN2-NEXT: .LBB92_6: ; %atomicrmw.private +; GCN2-NEXT: .LBB92_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -13150,37 +14740,21 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB92_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB92_6 +; GCN3-NEXT: s_cbranch_execnz .LBB92_4 ; GCN3-NEXT: .LBB92_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB92_3: ; %atomicrmw.global -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB92_4: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN3-NEXT: s_cbranch_execnz .LBB92_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB92_2 -; GCN3-NEXT: .LBB92_6: ; %atomicrmw.private +; GCN3-NEXT: .LBB92_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -13213,40 +14787,21 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB93_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: ; %bb.1: ; 
%Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB93_6 +; GCN1-NEXT: s_cbranch_execnz .LBB93_4 ; GCN1-NEXT: .LBB93_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB93_3: ; %atomicrmw.global -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB93_4: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN1-NEXT: s_cbranch_execnz .LBB93_4 -; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB93_2 -; GCN1-NEXT: .LBB93_6: ; %atomicrmw.private +; GCN1-NEXT: .LBB93_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -13275,40 +14830,21 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB93_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB93_6 +; GCN2-NEXT: s_cbranch_execnz .LBB93_4 ; GCN2-NEXT: .LBB93_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB93_4: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN2-NEXT: s_cbranch_execnz .LBB93_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB93_2 -; GCN2-NEXT: .LBB93_6: ; %atomicrmw.private +; GCN2-NEXT: .LBB93_4: ; 
%atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -13335,37 +14871,21 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB93_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB93_6 +; GCN3-NEXT: s_cbranch_execnz .LBB93_4 ; GCN3-NEXT: .LBB93_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB93_3: ; %atomicrmw.global -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB93_4: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN3-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN3-NEXT: s_cbranch_execnz .LBB93_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB93_2 -; GCN3-NEXT: .LBB93_6: ; %atomicrmw.private +; GCN3-NEXT: .LBB93_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -15528,40 +17048,21 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB105_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_6 +; GCN1-NEXT: s_cbranch_execnz .LBB105_4 ; GCN1-NEXT: .LBB105_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB105_3: ; %atomicrmw.global -; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: flat_load_dword v6, v[0:1] -; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB105_4: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN1-NEXT: s_cbranch_execnz .LBB105_4 -; 
GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB105_2 -; GCN1-NEXT: .LBB105_6: ; %atomicrmw.private +; GCN1-NEXT: .LBB105_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -15589,40 +17090,21 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB105_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_6 +; GCN2-NEXT: s_cbranch_execnz .LBB105_4 ; GCN2-NEXT: .LBB105_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB105_3: ; %atomicrmw.global -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: flat_load_dword v6, v[0:1] -; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB105_4: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN2-NEXT: s_cbranch_execnz .LBB105_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB105_2 -; GCN2-NEXT: .LBB105_6: ; %atomicrmw.private +; GCN2-NEXT: .LBB105_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -15648,37 +17130,21 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB105_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_6 +; GCN3-NEXT: s_cbranch_execnz .LBB105_4 ; GCN3-NEXT: .LBB105_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB105_3: ; %atomicrmw.global -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB105_4: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] 
glc +; GCN3-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN3-NEXT: s_cbranch_execnz .LBB105_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB105_2 -; GCN3-NEXT: .LBB105_6: ; %atomicrmw.private +; GCN3-NEXT: .LBB105_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -15711,40 +17177,21 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB106_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB106_6 +; GCN1-NEXT: s_cbranch_execnz .LBB106_4 ; GCN1-NEXT: .LBB106_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB106_3: ; %atomicrmw.global -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB106_4: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN1-NEXT: s_cbranch_execnz .LBB106_4 -; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB106_2 -; GCN1-NEXT: .LBB106_6: ; %atomicrmw.private +; GCN1-NEXT: .LBB106_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -15773,40 +17220,21 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB106_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB106_6 +; GCN2-NEXT: s_cbranch_execnz .LBB106_4 ; GCN2-NEXT: .LBB106_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB106_3: ; %atomicrmw.global -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; 
GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB106_4: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN2-NEXT: s_cbranch_execnz .LBB106_4 -; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB106_2 -; GCN2-NEXT: .LBB106_6: ; %atomicrmw.private +; GCN2-NEXT: .LBB106_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -15833,37 +17261,21 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB106_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB106_6 +; GCN3-NEXT: s_cbranch_execnz .LBB106_4 ; GCN3-NEXT: .LBB106_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB106_3: ; %atomicrmw.global -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB106_4: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN3-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN3-NEXT: s_cbranch_execnz .LBB106_4 -; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB106_2 -; GCN3-NEXT: .LBB106_6: ; %atomicrmw.private +; GCN3-NEXT: .LBB106_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -17383,40 +18795,21 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB115_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
-; GCN1-NEXT: s_cbranch_execnz .LBB115_6
+; GCN1-NEXT: s_cbranch_execnz .LBB115_4
; GCN1-NEXT: .LBB115_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v7, v[4:5]
-; GCN1-NEXT: flat_load_dword v6, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB115_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v7, v5
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v6, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB115_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB115_2
-; GCN1-NEXT: .LBB115_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB115_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -17444,40 +18837,21 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB115_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB115_6
+; GCN2-NEXT: s_cbranch_execnz .LBB115_4
; GCN2-NEXT: .LBB115_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v7, v[4:5]
-; GCN2-NEXT: flat_load_dword v6, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB115_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v7, v5
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v6, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB115_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB115_2
-; GCN2-NEXT: .LBB115_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB115_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -17503,37 +18877,21 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB115_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB115_6
+; GCN3-NEXT: s_cbranch_execnz .LBB115_4
; GCN3-NEXT: .LBB115_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB115_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB115_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB115_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB115_2
-; GCN3-NEXT: .LBB115_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB115_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -17566,40 +18924,21 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB116_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB116_6
+; GCN1-NEXT: s_cbranch_execnz .LBB116_4
; GCN1-NEXT: .LBB116_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB116_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: flat_load_dword v0, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB116_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v9, v1
-; GCN1-NEXT: v_mov_b32_e32 v8, v0
-; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB116_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB116_2
-; GCN1-NEXT: .LBB116_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB116_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -17628,40 +18967,21 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB116_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB116_6
+; GCN2-NEXT: s_cbranch_execnz .LBB116_4
; GCN2-NEXT: .LBB116_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB116_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: flat_load_dword v0, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB116_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v9, v1
-; GCN2-NEXT: v_mov_b32_e32 v8, v0
-; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB116_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB116_2
-; GCN2-NEXT: .LBB116_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB116_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -17688,37 +19008,21 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB116_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB116_6
+; GCN3-NEXT: s_cbranch_execnz .LBB116_4
; GCN3-NEXT: .LBB116_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB116_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB116_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v9, v1
-; GCN3-NEXT: v_mov_b32_e32 v8, v0
-; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN3-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB116_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB116_2
-; GCN3-NEXT: .LBB116_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB116_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20077,40 +21381,21 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB129_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB129_6
+; GCN1-NEXT: s_cbranch_execnz .LBB129_4
; GCN1-NEXT: .LBB129_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB129_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v7, v[4:5]
-; GCN1-NEXT: flat_load_dword v6, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB129_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v7, v5
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v6, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB129_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB129_2
-; GCN1-NEXT: .LBB129_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB129_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -20138,40 +21423,21 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB129_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB129_6
+; GCN2-NEXT: s_cbranch_execnz .LBB129_4
; GCN2-NEXT: .LBB129_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB129_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v7, v[4:5]
-; GCN2-NEXT: flat_load_dword v6, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB129_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v7, v5
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v6, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB129_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB129_2
-; GCN2-NEXT: .LBB129_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB129_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -20197,37 +21463,21 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB129_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB129_6
+; GCN3-NEXT: s_cbranch_execnz .LBB129_4
; GCN3-NEXT: .LBB129_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB129_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB129_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB129_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB129_2
-; GCN3-NEXT: .LBB129_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB129_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20260,40 +21510,21 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB130_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB130_6
+; GCN1-NEXT: s_cbranch_execnz .LBB130_4
; GCN1-NEXT: .LBB130_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB130_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: flat_load_dword v0, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB130_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v9, v1
-; GCN1-NEXT: v_mov_b32_e32 v8, v0
-; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB130_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB130_2
-; GCN1-NEXT: .LBB130_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB130_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -20322,40 +21553,21 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB130_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB130_6
+; GCN2-NEXT: s_cbranch_execnz .LBB130_4
; GCN2-NEXT: .LBB130_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB130_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: flat_load_dword v0, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB130_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v9, v1
-; GCN2-NEXT: v_mov_b32_e32 v8, v0
-; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB130_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB130_2
-; GCN2-NEXT: .LBB130_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB130_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -20382,37 +21594,21 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB130_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB130_6
+; GCN3-NEXT: s_cbranch_execnz .LBB130_4
; GCN3-NEXT: .LBB130_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB130_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB130_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v9, v1
-; GCN3-NEXT: v_mov_b32_e32 v8, v0
-; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN3-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB130_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB130_2
-; GCN3-NEXT: .LBB130_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB130_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20446,21 +21642,42 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB131_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB131_4
+; GCN1-NEXT: s_cbranch_execnz .LBB131_6
; GCN1-NEXT: .LBB131_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB131_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB131_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB131_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB131_2
-; GCN1-NEXT: .LBB131_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB131_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20489,21 +21706,42 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB131_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB131_4
+; GCN2-NEXT: s_cbranch_execnz .LBB131_6
; GCN2-NEXT: .LBB131_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB131_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB131_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB131_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB131_2
-; GCN2-NEXT: .LBB131_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB131_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20530,21 +21768,39 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB131_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB131_4
+; GCN3-NEXT: s_cbranch_execnz .LBB131_6
; GCN3-NEXT: .LBB131_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB131_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB131_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB131_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB131_2
-; GCN3-NEXT: .LBB131_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB131_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20578,21 +21834,42 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB132_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB132_4
+; GCN1-NEXT: s_cbranch_execnz .LBB132_6
; GCN1-NEXT: .LBB132_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB132_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB132_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB132_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB132_2
-; GCN1-NEXT: .LBB132_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB132_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20623,21 +21900,42 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB132_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB132_4
+; GCN2-NEXT: s_cbranch_execnz .LBB132_6
; GCN2-NEXT: .LBB132_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB132_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB132_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB132_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB132_2
-; GCN2-NEXT: .LBB132_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB132_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20666,21 +21964,39 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB132_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB132_4
+; GCN3-NEXT: s_cbranch_execnz .LBB132_6
; GCN3-NEXT: .LBB132_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB132_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB132_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB132_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB132_2
-; GCN3-NEXT: .LBB132_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB132_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20708,44 +22024,61 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN1-NEXT: v_mov_b32_e32 v5, v1
-; GCN1-NEXT: v_mov_b32_e32 v4, v0
-; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB133_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB133_4
-; GCN1-NEXT: .LBB133_2: ; %atomicrmw.phi
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-; GCN1-NEXT: .LBB133_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: s_cbranch_execz .LBB133_4
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v5, v[4:5]
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB133_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB133_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GCN1-NEXT: .LBB133_4: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execz .LBB133_2
-; GCN1-NEXT: .LBB133_4: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
-; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_cbranch_execz .LBB133_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
-; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0
+; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v4
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
-; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN1-NEXT: .LBB133_6: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v4
+; GCN1-NEXT: v_mov_b32_e32 v1, v5
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -20754,44 +22087,61 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN2-NEXT: v_mov_b32_e32 v5, v1
-; GCN2-NEXT: v_mov_b32_e32 v4, v0
-; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB133_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB133_4
-; GCN2-NEXT: .LBB133_2: ; %atomicrmw.phi
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-; GCN2-NEXT: .LBB133_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: s_cbranch_execz .LBB133_4
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v5, v[4:5]
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB133_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB133_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GCN2-NEXT: .LBB133_4: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execz .LBB133_2
-; GCN2-NEXT: .LBB133_4: ; %atomicrmw.private
-; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_cbranch_execz .LBB133_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
-; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0
+; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v4
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
-; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN2-NEXT: .LBB133_6: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v4
+; GCN2-NEXT: v_mov_b32_e32 v1, v5
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -20806,21 +22156,39 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB133_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB133_4
+; GCN3-NEXT: s_cbranch_execnz .LBB133_6
; GCN3-NEXT: .LBB133_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB133_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB133_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB133_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB133_2
-; GCN3-NEXT: .LBB133_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB133_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20855,21 +22223,42 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB134_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB134_4
+; GCN1-NEXT: s_cbranch_execnz .LBB134_6
; GCN1-NEXT: .LBB134_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB134_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB134_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v8
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB134_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB134_2
-; GCN1-NEXT: .LBB134_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB134_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20901,21 +22290,42 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB134_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB134_4
+; GCN2-NEXT: s_cbranch_execnz .LBB134_6
; GCN2-NEXT: .LBB134_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB134_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB134_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v8
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB134_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB134_2
-; GCN2-NEXT: .LBB134_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB134_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20945,21 +22355,39 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB134_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB134_4
+; GCN3-NEXT: s_cbranch_execnz .LBB134_6
; GCN3-NEXT: .LBB134_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB134_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB134_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB134_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB134_2
-; GCN3-NEXT: .LBB134_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB134_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20993,21 +22421,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN1-NEXT: s_mov_b64 s[34:35], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB135_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB135_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccnz .LBB135_6
; GCN1-NEXT: .LBB135_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB135_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v4, s4
+; GCN1-NEXT: v_mov_b32_e32 v5, s5
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB135_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB135_2
-; GCN1-NEXT: .LBB135_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB135_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB135_2
+; GCN1-NEXT: .LBB135_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -21039,21 +22488,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN2-NEXT: s_mov_b64 s[34:35], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB135_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB135_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccnz .LBB135_6
; GCN2-NEXT: .LBB135_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB135_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB135_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB135_2
-; GCN2-NEXT: .LBB135_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB135_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB135_2
+; GCN2-NEXT: .LBB135_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -21082,21 +22552,37 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_mov_b64 s[34:35], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB135_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB135_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccnz .LBB135_6
; GCN3-NEXT: .LBB135_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB135_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB135_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB135_2
-; GCN3-NEXT: .LBB135_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB135_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB135_2
+; GCN3-NEXT: .LBB135_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -21131,21 +22617,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN1-NEXT: s_mov_b64 s[36:37], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB136_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB136_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccnz .LBB136_6
; GCN1-NEXT: .LBB136_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB136_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v4, s34
+; GCN1-NEXT: v_mov_b32_e32 v5, s35
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB136_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB136_2
-; GCN1-NEXT: .LBB136_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB136_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB136_2
+; GCN1-NEXT: .LBB136_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -21179,21 +22686,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN2-NEXT: s_mov_b64 s[36:37], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB136_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB136_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccnz .LBB136_6
; GCN2-NEXT: .LBB136_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB136_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v4, s34
+; GCN2-NEXT: v_mov_b32_e32 v5, s35
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB136_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB136_2
-; GCN2-NEXT: .LBB136_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB136_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB136_2
+; GCN2-NEXT: .LBB136_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -21224,21 +22752,37 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_mov_b64 s[36:37], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB136_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB136_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccnz .LBB136_6
; GCN3-NEXT: .LBB136_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB136_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB136_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB136_2
-; GCN3-NEXT: .LBB136_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB136_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB136_2
+; GCN3-NEXT: .LBB136_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -21270,20 +22814,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN1-NEXT: s_cmp_eq_u32 s5, s34
; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB137_2
+; GCN1-NEXT: s_cbranch_vccz .LBB137_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB137_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB137_3
-; GCN1-NEXT: s_branch .LBB137_4
-; GCN1-NEXT: .LBB137_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB137_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB137_6
+; GCN1-NEXT: .LBB137_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB137_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB137_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -21301,7 +22866,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB137_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB137_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -21314,20 +22879,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN2-NEXT: s_cmp_eq_u32 s5, s34
; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB137_2
+; GCN2-NEXT: s_cbranch_vccz .LBB137_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB137_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB137_3
-; GCN2-NEXT: s_branch .LBB137_4
-; GCN2-NEXT: .LBB137_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB137_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB137_6
+; GCN2-NEXT: .LBB137_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB137_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB137_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -21344,7 +22930,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB137_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB137_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -21355,20 +22941,36 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN3-NEXT: s_cmp_eq_u32 s5, s35
; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB137_2
+; GCN3-NEXT: s_cbranch_vccz .LBB137_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB137_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB137_3
-; GCN3-NEXT: s_branch .LBB137_4
-; GCN3-NEXT: .LBB137_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB137_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB137_6
+; GCN3-NEXT: .LBB137_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB137_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB137_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -21383,7 +22985,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB137_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB137_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
@@ -21402,20 +23004,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: s_cmp_eq_u32 s35, s36
; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB138_2
+; GCN1-NEXT: s_cbranch_vccz .LBB138_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: v_mov_b32_e32 v3, s35
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB138_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB138_3
-; GCN1-NEXT: s_branch .LBB138_4
-; GCN1-NEXT: .LBB138_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB138_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB138_6
+; GCN1-NEXT: .LBB138_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB138_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB138_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -21433,7 +23056,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB138_4: ; %atomicrmw.end
+; GCN1-NEXT:
.LBB138_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -21448,20 +23071,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB138_2 +; GCN2-NEXT: s_cbranch_vccz .LBB138_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB138_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB138_3 -; GCN2-NEXT: s_branch .LBB138_4 -; GCN2-NEXT: .LBB138_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB138_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB138_6 +; GCN2-NEXT: .LBB138_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB138_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB138_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -21478,7 +23122,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB138_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB138_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -21491,20 +23135,36 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB138_2 +; GCN3-NEXT: s_cbranch_vccz .LBB138_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB138_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: 
v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB138_3 -; GCN3-NEXT: s_branch .LBB138_4 -; GCN3-NEXT: .LBB138_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB138_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB138_6 +; GCN3-NEXT: .LBB138_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB138_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB138_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -21519,7 +23179,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB138_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB138_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -21817,23 +23477,46 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB141_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB141_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB141_6 ; GCN1-NEXT: .LBB141_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB141_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB141_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, 
s[10:11] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN1-NEXT: s_cbranch_execz .LBB141_2 -; GCN1-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -21861,23 +23544,46 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB141_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB141_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB141_6 ; GCN2-NEXT: .LBB141_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB141_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB141_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN2-NEXT: s_cbranch_execz .LBB141_2 -; GCN2-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -21903,23 +23609,43 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB141_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB141_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB141_6 ; GCN3-NEXT: .LBB141_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: 
s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB141_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB141_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB141_2 -; GCN3-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -21952,23 +23678,46 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB142_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB142_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB142_6 ; GCN1-NEXT: .LBB142_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB142_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB142_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, 
exec, s[10:11] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN1-NEXT: s_cbranch_execz .LBB142_2 -; GCN1-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -21998,23 +23747,46 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB142_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB142_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB142_6 ; GCN2-NEXT: .LBB142_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB142_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB142_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN2-NEXT: s_cbranch_execz .LBB142_2 -; GCN2-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -22042,23 +23814,43 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB142_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB142_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB142_6 ; GCN3-NEXT: .LBB142_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, 
s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB142_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB142_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB142_2 -; GCN3-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -22087,46 +23879,65 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB143_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB143_4 -; GCN1-NEXT: .LBB143_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB143_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB143_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB143_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; 
GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB143_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB143_2 -; GCN1-NEXT: .LBB143_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: .LBB143_4: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execz .LBB143_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 +; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v4 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] +; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v5, s[6:7] ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB143_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22135,46 +23946,65 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB143_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB143_4 -; GCN2-NEXT: .LBB143_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB143_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB143_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; 
GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB143_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB143_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB143_2 -; GCN2-NEXT: .LBB143_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: .LBB143_4: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execz .LBB143_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 +; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v4 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] +; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v5, s[6:7] ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB143_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22187,23 +24017,43 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB143_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB143_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] 
+; GCN3-NEXT: s_cbranch_execnz .LBB143_6 ; GCN3-NEXT: .LBB143_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB143_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB143_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB143_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB143_2 -; GCN3-NEXT: .LBB143_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB143_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -22238,23 +24088,46 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB144_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB144_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB144_6 ; GCN1-NEXT: .LBB144_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB144_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 
s[10:11], vcc, s[10:11] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB144_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN1-NEXT: s_cbranch_execz .LBB144_2 -; GCN1-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -22286,23 +24159,46 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB144_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB144_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB144_6 ; GCN2-NEXT: .LBB144_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB144_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB144_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN2-NEXT: s_cbranch_execz .LBB144_2 -; GCN2-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -22332,23 +24228,43 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB144_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB144_4 +; GCN3-NEXT: ; 
%bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB144_6 ; GCN3-NEXT: .LBB144_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB144_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB144_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB144_2 -; GCN3-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -22384,21 +24300,46 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB145_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB145_6 ; GCN1-NEXT: .LBB145_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB145_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; 
GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB145_2 -; GCN1-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB145_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_branch .LBB145_2 +; GCN1-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec @@ -22433,21 +24374,46 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB145_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB145_6 ; GCN2-NEXT: .LBB145_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB145_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB145_2 -; GCN2-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB145_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_branch .LBB145_2 +; GCN2-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -22479,21 +24445,41 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB145_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz 
.LBB145_6 ; GCN3-NEXT: .LBB145_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB145_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB145_2 -; GCN3-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_cbranch_execnz .LBB145_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_branch .LBB145_2 +; GCN3-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -22522,34 +24508,59 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 -; GCN1-NEXT: s_add_u32 s34, s4, 32 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_add_u32 s38, s4, 32 +; GCN1-NEXT: s_addc_u32 s39, s5, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s35, s36 -; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cmp_eq_u32 s39, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB146_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN1-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB146_3: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s34, s38, 4 +; GCN1-NEXT: s_addc_u32 s35, s39, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s38 +; GCN1-NEXT: v_mov_b32_e32 v5, s39 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[40:41], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .LBB146_4: ; %atomicrmw.start +; 
GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB146_2 -; GCN1-NEXT: .LBB146_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_cbranch_execnz .LBB146_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_branch .LBB146_2 +; GCN1-NEXT: .LBB146_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec -; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s38, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s34 @@ -22573,32 +24584,57 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 -; GCN2-NEXT: s_add_u32 s34, s4, 32 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_add_u32 s38, s4, 32 +; GCN2-NEXT: s_addc_u32 s39, s5, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s35, s36 -; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cmp_eq_u32 s39, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB146_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN2-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB146_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s34, s38, 4 +; GCN2-NEXT: s_addc_u32 s35, s39, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s38 +; GCN2-NEXT: v_mov_b32_e32 v5, s39 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[40:41], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .LBB146_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], 
-1, v3, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB146_2 -; GCN2-NEXT: .LBB146_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 -; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_cbranch_execnz .LBB146_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_branch .LBB146_2 +; GCN2-NEXT: .LBB146_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0 +; GCN2-NEXT: s_cselect_b32 s34, s38, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s34 @@ -22622,31 +24658,51 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 -; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base -; GCN3-NEXT: s_cmp_eq_u32 s35, s37 -; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_add_u32 s38, s4, 32 +; GCN3-NEXT: s_addc_u32 s39, s5, 0 +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s39, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB146_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN3-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB146_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s38 +; GCN3-NEXT: v_mov_b32_e32 v5, s39 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[40:41], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .LBB146_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB146_2 -; GCN3-NEXT: .LBB146_4: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 -; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 
+; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_cbranch_execnz .LBB146_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_branch .LBB146_2 +; GCN3-NEXT: .LBB146_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0 +; GCN3-NEXT: s_cselect_b32 s34, s38, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 @@ -22679,20 +24735,45 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB147_2 +; GCN1-NEXT: s_cbranch_vccz .LBB147_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .LBB147_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB147_3 -; GCN1-NEXT: s_branch .LBB147_4 -; GCN1-NEXT: .LBB147_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB147_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_branch .LBB147_6 +; GCN1-NEXT: .LBB147_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB147_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec @@ -22714,7 +24795,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22727,20 +24808,45 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, 
exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB147_2 +; GCN2-NEXT: s_cbranch_vccz .LBB147_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .LBB147_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB147_3 -; GCN2-NEXT: s_branch .LBB147_4 -; GCN2-NEXT: .LBB147_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB147_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_branch .LBB147_6 +; GCN2-NEXT: .LBB147_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB147_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -22761,7 +24867,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22772,20 +24878,40 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB147_2 +; GCN3-NEXT: s_cbranch_vccz .LBB147_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .LBB147_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB147_3 -; GCN3-NEXT: s_branch .LBB147_4 -; GCN3-NEXT: .LBB147_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_cbranch_execnz .LBB147_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_branch .LBB147_6 +; GCN3-NEXT: .LBB147_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB147_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -22804,7 +24930,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst @@ -22816,31 +24942,56 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 -; GCN1-NEXT: s_add_u32 s34, s4, 32 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_add_u32 s38, s4, 32 +; GCN1-NEXT: s_addc_u32 s39, s5, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s35, s36 -; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB148_2 +; GCN1-NEXT: s_cmp_eq_u32 s39, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB148_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s34, s38, 4 +; GCN1-NEXT: s_addc_u32 s35, s39, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v2, s38 +; GCN1-NEXT: v_mov_b32_e32 v3, s39 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[40:41], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, 
vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB148_3 -; GCN1-NEXT: s_branch .LBB148_4 -; GCN1-NEXT: .LBB148_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_cbranch_execnz .LBB148_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_branch .LBB148_6 +; GCN1-NEXT: .LBB148_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB148_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_cbranch_execz .LBB148_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec -; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s38, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s34 @@ -22858,7 +25009,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22866,29 +25017,54 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 -; GCN2-NEXT: s_add_u32 s34, s4, 32 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_add_u32 s38, s4, 32 +; GCN2-NEXT: s_addc_u32 s39, s5, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s35, s36 -; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB148_2 +; GCN2-NEXT: s_cmp_eq_u32 s39, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB148_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s34, s38, 4 +; GCN2-NEXT: s_addc_u32 s35, s39, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v2, s38 +; GCN2-NEXT: v_mov_b32_e32 v3, s39 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[40:41], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 
v6, v0, v5, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB148_3 -; GCN2-NEXT: s_branch .LBB148_4 -; GCN2-NEXT: .LBB148_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_cbranch_execnz .LBB148_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_branch .LBB148_6 +; GCN2-NEXT: .LBB148_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB148_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 -; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: s_cbranch_execz .LBB148_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0 +; GCN2-NEXT: s_cselect_b32 s34, s38, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s34 @@ -22907,35 +25083,55 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 -; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base -; GCN3-NEXT: s_cmp_eq_u32 s35, s37 -; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB148_2 +; GCN3-NEXT: s_add_u32 s38, s4, 32 +; GCN3-NEXT: s_addc_u32 s39, s5, 0 +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s39, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB148_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s38 +; GCN3-NEXT: v_mov_b32_e32 v3, s39 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[40:41], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB148_3 -; GCN3-NEXT: s_branch .LBB148_4 -; GCN3-NEXT: .LBB148_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41] +; 
GCN3-NEXT: s_cbranch_execnz .LBB148_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_branch .LBB148_6 +; GCN3-NEXT: .LBB148_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB148_3: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 -; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: s_cbranch_execz .LBB148_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0 +; GCN3-NEXT: s_cselect_b32 s34, s38, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 @@ -22952,7 +25148,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index fe47461ebf956..4dea4495b36fb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -1097,25 +1097,76 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_sub_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -1125,29 +1176,80 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_sub_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; 
GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1158,25 +1260,82 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_sub_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; 
GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -1186,29 +1345,82 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_sub_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 +; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: 
v_sub_u32_e32 v6, vcc, v8, v2 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1219,41 +1431,99 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; 
GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 - ret void -} +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { ; GFX7-LABEL: flat_atomic_sub_i64_noret_offset_scalar: @@ -1261,13 +1531,32 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; 
GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret_offset_scalar: @@ -1275,25 +1564,60 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1304,37 +1628,95 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v8, v1 +; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -1346,13 +1728,32 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v8, v1 +; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret_offset_scalar: @@ -1360,25 +1761,60 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: .LBB37_1: ; 
%atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1459,25 +1895,76 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_and_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; 
GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB40_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -1487,29 +1974,80 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_and_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; 
GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1520,25 +2058,82 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_and_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 
v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -1548,29 +2143,82 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_and_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1581,37 +2229,92 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -1623,13 +2326,31 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: 
s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret_offset_scalar: @@ -1637,25 +2358,58 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1666,37 +2420,92 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 
; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -1708,13 +2517,31 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret_offset_scalar: @@ -1722,25 +2549,58 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -2771,25 +3631,76 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_or_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol 
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -2799,29 +3710,80 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_or_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -2832,25 +3794,82 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_or_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB62_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; 
GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB62_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB62_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -2860,29 +3879,82 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_or_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB63_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: 
v_or_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB63_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -2893,37 +3965,92 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX7-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB64_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 
v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB64_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB64_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -2935,13 +4062,31 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB65_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret_offset_scalar: @@ -2949,25 +4094,58 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB65_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB65_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -2978,37 +4156,92 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX7-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; 
GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB66_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB66_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 
s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB66_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -3020,13 +4253,31 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB67_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret_offset_scalar: @@ -3034,25 +4285,58 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB67_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3133,25 +4417,76 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_xor_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB70_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB70_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -3161,29 +4496,80 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_xor_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB71_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB71_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 
+; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB71_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3194,25 +4580,82 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_xor_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB72_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB72_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB72_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -3222,29 +4665,82 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_xor_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_xor_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_xor_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB73_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_xor_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_xor_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB73_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 
v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB73_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3255,37 +4751,92 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB74_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], 
v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB74_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB74_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -3297,13 +4848,31 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB75_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret_offset_scalar: @@ -3311,25 +4880,58 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: 
s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB75_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB75_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3340,37 +4942,92 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB76_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB76_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB76_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -3382,13 +5039,31 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: 
s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB77_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret_offset_scalar: @@ -3396,25 +5071,58 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB77_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB77_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -4697,83 +6405,29 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX7-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB92_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB92_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB92_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB92_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -4784,85 +6438,29 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX7-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[0:1] -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB93_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB93_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v1, v[0:1] -; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB93_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB93_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB93_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -5977,83 +7575,29 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX7-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB105_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB105_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB105_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; 
GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB105_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -6064,85 +7608,29 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX7-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[0:1] -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB106_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB106_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
36, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v1, v[0:1] -; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB106_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB106_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -6921,83 +8409,29 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX7-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] 
-; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB115_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB115_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB115_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -7008,85 +8442,29 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX7-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[0:1] -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB116_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v1, v[0:1] -; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB116_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB116_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -8296,83 +9674,29 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX7-LABEL: 
flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB129_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB129_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB129_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB129_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB129_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -8383,85 +9707,29 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX7-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v1, v[0:1] -; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB130_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB130_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v1, v[0:1] -; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB130_1: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB130_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 
-; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB130_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -8476,25 +9744,85 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB131_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB131_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB131_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -8504,29 +9832,89 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB132_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX8-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB132_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB132_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8537,25 +9925,91 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB133_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; 
GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB133_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB133_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -8565,29 +10019,91 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v8 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB134_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB134_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB134_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8598,37 +10114,101 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB135_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB135_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, 
vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB135_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -8640,13 +10220,34 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB136_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: @@ -8654,25 +10255,64 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: 
v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB136_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB136_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8683,37 +10323,101 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 
s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB137_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB137_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB137_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -8725,13 +10429,34 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: 
v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB138_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: @@ -8739,25 +10464,64 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB138_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB138_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8838,25 +10602,91 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GFX7-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB141_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: 
s_cbranch_execnz .LBB141_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB141_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -8866,29 +10696,95 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v6 +; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB142_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; 
GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v6 +; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB142_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB142_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8899,25 +10795,97 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GFX7-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB143_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB143_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB143_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -8927,29 +10895,97 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: 
v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 +; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB144_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB144_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: 
v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB144_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8960,37 +10996,113 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[38:39], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_cbranch_execnz .LBB145_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[38:39], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX8-NEXT: s_cbranch_execnz .LBB145_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB145_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -9002,13 +11114,38 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-NEXT: v_mov_b32_e32 v4, s34
+; GFX7-NEXT: v_mov_b32_e32 v5, s35
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: flat_load_dword v2, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[38:39], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: .LBB146_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_cbranch_execnz .LBB146_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: @@ -9016,25 +11153,72 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[38:39], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_cbranch_execnz .LBB146_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 
s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB146_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -9045,37 +11229,113 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[38:39], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_cbranch_execnz .LBB147_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[38:39], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_cbranch_execnz .LBB147_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB147_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -9087,13 +11347,38 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[38:39], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_cbranch_execnz .LBB148_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: @@ -9101,25 +11386,72 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[38:39], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_cbranch_execnz .LBB148_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 +; GFX9-NEXT: 
v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB148_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir index d00fd9b967f37..74c4a2da50221 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir @@ -43,8 +43,7 @@ body: | ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[DEF]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1 - ; GCN-NEXT: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 0, [[DEF1]], implicit $exec - ; GCN-NEXT: [[V_XOR_B32_e32_1:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[DEF2]], [[REG_SEQUENCE]].sub0, implicit $exec + ; GCN-NEXT: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[DEF2]], [[REG_SEQUENCE]].sub0, implicit $exec %0:vgpr_32 = IMPLICIT_DEF %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir index 7fad2f466bc9f..a88b1ecc40cc9 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir @@ -75,7 +75,8 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_0 - ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 256, implicit-def $scc + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 256 + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]], implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 %stack.0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir index cc4314263bcba..2f2d727ee2c59 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir @@ -46,7 +46,8 @@ body: | %2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc ... 
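+# The frame index and the literal 70 can no longer both fold into the
+# S_ADD_I32: the frame index itself lowers to an immediate stack
+# offset, and a SOP2 instruction can encode at most one literal
+# constant. A rough sketch of the expected lowering (illustrative
+# virtual register names, not literal check lines):
+#   %lit:sreg_32 = S_MOV_B32 70
+#   %sum:sreg_32 = S_ADD_I32 %stack.0, %lit, implicit-def $scc
+# which matches the updated S_MOV_B32 + S_ADD_I32 checks below.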
# GCN-LABEL: name: test_frameindex{{$}} -# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 70 +# GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 70 +# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]] --- name: test_frameindex tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir b/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir index b1aa88969c5bb..dc03eb74cbf11 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir @@ -8,8 +8,8 @@ body: | ; CHECK-LABEL: name: test_tryFoldZeroHighBits_skips_nonreg ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 - ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, 0, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit [[V_AND_B32_e64_]] + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]] %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %1:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1 %2:vgpr_32 = V_AND_B32_e64 65535, %1.sub0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index edf699aac2a7b..84ccfb9ff6991 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -354,4 +354,186 @@ entry: ret void } +; Check for "SOP2/SOPC instruction requires too many immediate +; constants" verifier error. Frame index would fold into low half of +; the lowered flat pointer add, and use s_add_u32 instead of +; s_add_i32. 
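+;
+; A minimal sketch of the rejected form (hypothetical assembly, for
+; illustration only): with the frame index folded, the low half of the
+; pointer add would need two literal operands in a single SOP2
+; instruction, e.g.
+;   s_add_u32 s0, 0x1010, 0x1000 ; two literals -> verifier error
+; SOP2 can encode at most one literal constant, so the checks below
+; expect the 0x1000 operand to be materialized first via s_movk_i32.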
+
+; GCN-LABEL: {{^}}fi_sop2_s_add_u32_literal_error:
+; GCN: s_movk_i32 [[S_MOVK_I32_:s[0-9]+]], 0x1000
+; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0x1010, [[S_MOVK_I32_]]
+; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0
+define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 {
+entry:
+  %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
+  %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
+  %Total3.ascast.i.i = addrspacecast ptr addrspace(5) %Total3.i.i to ptr
+  %gep = getelementptr i8, ptr %Total3.ascast.i.i, i64 4096
+  %p2i = ptrtoint ptr %gep to i64
+  br label %.shuffle.then.i.i.i.i
+
+.shuffle.then.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i, %entry
+  store i64 0, ptr addrspace(5) null, align 4
+  %icmp = icmp ugt i64 %p2i, 1
+  br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i
+
+vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i
+  %wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
+  store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fi_sop2_and_literal_error:
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1fe00
+define amdgpu_kernel void @fi_sop2_and_literal_error() #0 {
+entry:
+  %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
+  %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
+  %p2i = ptrtoint ptr addrspace(5) %Total3.i.i to i32
+  br label %.shuffle.then.i.i.i.i
+
+.shuffle.then.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i, %entry
+  store i64 0, ptr addrspace(5) null, align 4
+  %or = and i32 %p2i, -512
+  %icmp = icmp ugt i32 %or, 9999999
+  br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i
+
+vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i
+  %wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
+  store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fi_sop2_or_literal_error:
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
+define amdgpu_kernel void @fi_sop2_or_literal_error() #0 {
+entry:
+  %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
+  %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
+  %p2i = ptrtoint ptr addrspace(5) %Total3.i.i to i32
+  br label %.shuffle.then.i.i.i.i
+
+.shuffle.then.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i, %entry
+  store i64 0, ptr addrspace(5) null, align 4
+  %or = or i32 %p2i, 12345
+  %icmp = icmp ugt i32 %or, 9999999
+  br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i
+
+vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i
+  %wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
+  store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
+  ret void
+}
+
+; Check that we do not produce a verifier error after prolog/epilog
+; insertion. alloca1 and alloca2 will lower to literals.
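+; Offsets 0x44 (68) and 0x48 (72) are outside the integer
+; inline-constant range of -16..64, so both frame indexes must be
+; encoded as literal constants; the *_one_imm_one_literal_offset and
+; *_imm_offsets variants further below cover the cases where one or
+; both offsets (64; 16 and 20) still fit as inline constants.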
+ +; GCN-LABEL: {{^}}s_multiple_frame_indexes_literal_offsets: +; GCN: s_load_dword [[ARG0:s[0-9]+]] +; GCN: s_movk_i32 [[ALLOCA1:s[0-9]+]], 0x44 +; GCN: s_cmp_eq_u32 [[ARG0]], 0 +; GCN: s_cselect_b32 [[SELECT:s[0-9]+]], [[ALLOCA1]], 0x48 +; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0 +; GCN: ; use [[SELECT]], [[ALLOCA0]] +define amdgpu_kernel void @s_multiple_frame_indexes_literal_offsets(i32 inreg %arg0) #0 { + %alloca0 = alloca [17 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %alloca2 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %arg0, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca1, ptr addrspace(5) %alloca2 + call void asm sideeffect "; use $0, $1","s,s"(ptr addrspace(5) %select, ptr addrspace(5) %alloca0) + ret void +} + +; %alloca1 or alloca2 will lower to an inline constant, and one will +; be a literal, so we could fold both indexes into the instruction. + +; GCN-LABEL: {{^}}s_multiple_frame_indexes_one_imm_one_literal_offset: +; GCN: s_load_dword [[ARG0:s[0-9]+]] +; GCN: s_mov_b32 [[ALLOCA1:s[0-9]+]], 64 +; GCN: s_cmp_eq_u32 [[ARG0]], 0 +; GCN: s_cselect_b32 [[SELECT:s[0-9]+]], [[ALLOCA1]], 0x44 +; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0 +; GCN: ; use [[SELECT]], [[ALLOCA0]] +define amdgpu_kernel void @s_multiple_frame_indexes_one_imm_one_literal_offset(i32 inreg %arg0) #0 { + %alloca0 = alloca [16 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %alloca2 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %arg0, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca1, ptr addrspace(5) %alloca2 + call void asm sideeffect "; use $0, $1","s,s"(ptr addrspace(5) %select, ptr addrspace(5) %alloca0) + ret void +} + +; GCN-LABEL: {{^}}s_multiple_frame_indexes_imm_offsets: +; GCN: s_load_dword [[ARG0:s[0-9]+]] +; GCN: s_mov_b32 [[ALLOCA1:s[0-9]+]], 16 +; GCN: s_cmp_eq_u32 [[ARG0]], 0 +; GCN: s_cselect_b32 [[SELECT:s[0-9]+]], [[ALLOCA1]], 20 +; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0 +; GCN: ; use [[SELECT]], [[ALLOCA0]] +define amdgpu_kernel void @s_multiple_frame_indexes_imm_offsets(i32 inreg %arg0) #0 { + %alloca0 = alloca [4 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %alloca2 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %arg0, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca1, ptr addrspace(5) %alloca2 + call void asm sideeffect "; use $0, $1","s,s"(ptr addrspace(5) %select, ptr addrspace(5) %alloca0) + ret void +} + +; GCN-LABEL: {{^}}v_multiple_frame_indexes_literal_offsets: +; GCN: v_mov_b32_e32 [[ALLOCA1:v[0-9]+]], 0x48 +; GCN: v_mov_b32_e32 [[ALLOCA2:v[0-9]+]], 0x44 +; GCN: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[ALLOCA1]], [[ALLOCA2]], vcc +; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}} +; GCN: ; use [[SELECT]], [[ALLOCA0]] +define amdgpu_kernel void @v_multiple_frame_indexes_literal_offsets() #0 { + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %alloca0 = alloca [17 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %alloca2 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %vgpr, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca1, ptr addrspace(5) %alloca2 + call void asm sideeffect "; use $0, $1","v,v"(ptr addrspace(5) %select, ptr addrspace(5) %alloca0) + ret void +} + +; GCN-LABEL: {{^}}v_multiple_frame_indexes_one_imm_one_literal_offset: +; GCN: v_mov_b32_e32 [[ALLOCA1:v[0-9]+]], 0x44 +; GCN: v_mov_b32_e32 [[ALLOCA2:v[0-9]+]], 64 +; GCN: v_cmp_eq_u32_e32 vcc, 0, 
v0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[ALLOCA1]], [[ALLOCA2]], vcc +; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}} +; GCN: ; use [[SELECT]], [[ALLOCA0]] +define amdgpu_kernel void @v_multiple_frame_indexes_one_imm_one_literal_offset() #0 { + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %alloca0 = alloca [16 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %alloca2 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %vgpr, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca1, ptr addrspace(5) %alloca2 + call void asm sideeffect "; use $0, $1","v,v"(ptr addrspace(5) %select, ptr addrspace(5) %alloca0) + ret void +} + +; GCN-LABEL: {{^}}v_multiple_frame_indexes_imm_offsets: +; GCN: v_mov_b32_e32 [[ALLOCA1:v[0-9]+]], 12 +; GCN: v_mov_b32_e32 [[ALLOCA2:v[0-9]+]], 8 +; GCN: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[ALLOCA1]], [[ALLOCA2]], vcc +; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}} +; GCN: ; use [[SELECT]], [[ALLOCA0]] +define amdgpu_kernel void @v_multiple_frame_indexes_imm_offsets() #0 { + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %alloca0 = alloca [2 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %alloca2 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %vgpr, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca1, ptr addrspace(5) %alloca2 + call void asm sideeffect "; use $0, $1","v,v"(ptr addrspace(5) %select, ptr addrspace(5) %alloca0) + ret void +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll index 1def479be283a..d297955f109ab 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -985,7 +985,7 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst + %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -1024,7 +1024,7 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst + %rtn = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -1061,7 +1061,7 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst + %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1098,7 +1098,7 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) 
%sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst + %unused = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1135,7 +1135,7 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst + %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -1174,7 +1174,7 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) i %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst + %rtn = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -1211,7 +1211,7 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst + %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1248,7 +1248,7 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst + %unused = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1289,7 +1289,7 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst + %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -1328,7 +1328,7 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst + %rtn = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -1365,7 +1365,7 @@ define amdgpu_ps void @global_and_saddr_i32_nortn(ptr addrspace(1) 
inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst + %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1402,7 +1402,7 @@ define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst + %unused = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1439,7 +1439,7 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst + %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -1478,7 +1478,7 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) i %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst + %rtn = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -1515,7 +1515,7 @@ define amdgpu_ps void @global_and_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst + %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1552,7 +1552,7 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst + %unused = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1593,7 +1593,7 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst + %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -1632,7 +1632,7 @@ define amdgpu_ps float 
@global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %s %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst + %rtn = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -1669,7 +1669,7 @@ define amdgpu_ps void @global_or_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst + %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1706,7 +1706,7 @@ define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(ptr addrspace(1) inreg % %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst + %unused = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1743,7 +1743,7 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sb ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst + %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -1782,7 +1782,7 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) in %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst + %rtn = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -1819,7 +1819,7 @@ define amdgpu_ps void @global_or_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst + %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1856,7 +1856,7 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(ptr addrspace(1) inreg % %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst + %unused = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1897,7 +1897,7 
@@ define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst + %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -1936,7 +1936,7 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst + %rtn = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -1973,7 +1973,7 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst + %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2010,7 +2010,7 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst + %unused = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2047,7 +2047,7 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst + %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -2086,7 +2086,7 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) i %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst + %rtn = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -2123,7 +2123,7 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst + %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret void } @@ -2160,7 +2160,7 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst + %unused = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2198,7 +2198,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -2234,7 +2234,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -2267,7 +2267,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst + %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2300,7 +2300,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst + %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2334,7 +2334,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -2370,7 +2370,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") 
seq_cst + %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -2403,7 +2403,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst + %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2436,7 +2436,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst + %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2474,7 +2474,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -2510,7 +2510,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -2543,7 +2543,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst + %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2576,7 +2576,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst + %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2610,7 +2610,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw 
min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -2646,7 +2646,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -2679,7 +2679,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst + %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2712,7 +2712,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst + %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2750,7 +2750,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -2786,7 +2786,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -2819,7 +2819,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst + %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2852,7 +2852,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = 
getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst + %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2886,7 +2886,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -2922,7 +2922,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -2955,7 +2955,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst + %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2988,7 +2988,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst + %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3026,7 +3026,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -3062,7 +3062,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to 
float ret float %cast.rtn } @@ -3095,7 +3095,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst + %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3128,7 +3128,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst + %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3162,7 +3162,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -3198,7 +3198,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst + %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -3231,7 +3231,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst + %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3264,7 +3264,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst + %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3664,7 +3664,7 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic + %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data 
syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -3690,7 +3690,7 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic + %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -3712,7 +3712,7 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic + %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3734,7 +3734,7 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic + %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3758,7 +3758,7 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic + %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -3784,7 +3784,7 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(ptr addrspace(1) i %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic + %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -3806,7 +3806,7 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic + %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3828,7 +3828,7 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr 
-  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
+  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -3857,7 +3857,7 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
+  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -3883,7 +3883,7 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
+  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -3905,7 +3905,7 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
+  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -3927,7 +3927,7 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
+  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -3951,7 +3951,7 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(ptr addrspace(1) inreg %s
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
+  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -3977,7 +3977,7 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(ptr addrspace(1) i
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
+  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -3999,7 +3999,7 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
+  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -4021,8 +4021,10 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
+  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
 attributes #0 = { argmemonly nounwind willreturn }
+
+!0 = !{}
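NOTE: every hunk in the file above makes the same mechanical change: it appends !amdgpu.no.fine.grained.memory !0 to an existing atomicrmw. As I understand the AMDGPU atomic metadata, this is a promise that the operation never touches fine-grained host-shared allocations, which lets the backend keep selecting native global atomic instructions instead of a conservative compare-and-swap expansion. A minimal sketch of the pattern outside the generated tests (the function name is illustrative, not part of the patch; the metadata must resolve to an empty node, as in the tests):

; Sketch only: tagging an atomicrmw with the metadata exercised above.
define void @sketch_tagged_atomic(ptr addrspace(1) %p, i32 %v) {
  %old = atomicrmw umax ptr addrspace(1) %p, i32 %v syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

!0 = !{}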
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 0512b9bc2b54a..10b64c3ca4dae 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -824,7 +824,7 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -893,7 +893,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -971,7 +971,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -1060,7 +1060,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1117,7 +1117,7 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -1185,7 +1185,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1260,7 +1260,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -1346,7 +1346,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1404,7 +1404,7 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -1473,7 +1473,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1551,7 +1551,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -1640,7 +1640,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1697,7 +1697,7 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -1765,7 +1765,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1840,7 +1840,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -1926,7 +1926,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1984,7 +1984,7 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -2049,7 +2049,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -2120,7 +2120,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -2205,7 +2205,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -2255,7 +2255,7 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -2319,7 +2319,7 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %out2 ret void } @@ -2387,7 +2387,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index - %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2469,7 +2469,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index - %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %out2 ret void } @@ -2520,7 +2520,7 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 - %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2585,7 +2585,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 - %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %out2 ret void } @@ -2656,7 +2656,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 - %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2741,7 +2741,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 - %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %out2 ret void } @@ -2791,7 +2791,7 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2855,7 +2855,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in 
syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %out2 ret void } @@ -2923,7 +2923,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index - %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3005,7 +3005,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index - %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %out2 ret void } @@ -3056,7 +3056,7 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 - %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3121,7 +3121,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 - %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %out2 ret void } @@ -3192,7 +3192,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 - %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3277,7 +3277,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 - %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %out2 ret void } @@ -3327,7 +3327,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst + %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3391,7 +3391,7 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm entry: - %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") 
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3459,7 +3459,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -3541,7 +3541,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3592,7 +3592,7 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -3657,7 +3657,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3728,7 +3728,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out,
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -3813,7 +3813,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3863,7 +3863,7 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -3927,7 +3927,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3995,7 +3995,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -4077,7 +4077,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4135,7 +4135,7 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -4204,7 +4204,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4282,7 +4282,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -4371,7 +4371,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4428,7 +4428,7 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -4496,7 +4496,7 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4571,7 +4571,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -4657,7 +4657,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4715,7 +4715,7 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -5978,7 +5978,7 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -6047,7 +6047,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -6125,7 +6125,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -6214,7 +6214,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -6271,7 +6271,7 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -6339,7 +6339,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX11-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -6414,7 +6414,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -6500,7 +6500,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -8233,7 +8233,7 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8294,7 +8294,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8353,7 +8353,7 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8418,7 +8418,7 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8487,7 +8487,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -8565,7 +8565,7 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8654,7 +8654,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -8712,7 +8712,7 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8773,7 +8773,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8832,7 +8832,7 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8897,7 +8897,7 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -8966,7 +8966,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -9044,7 +9044,7 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
@@ -9133,7 +9133,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -9437,3 +9437,243 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
   store bfloat %val, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i16 %in) {
+; SI-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_add_u32 s6, s0, 0x4650
+; SI-NEXT:    s_addc_u32 s1, s1, 0
+; SI-NEXT:    s_and_b32 s0, s6, -4
+; SI-NEXT:    s_and_b32 s6, s6, 3
+; SI-NEXT:    s_and_b32 s2, s2, 0xffff
+; SI-NEXT:    s_load_dword s9, s[0:1], 0x0
+; SI-NEXT:    s_lshl_b32 s7, s6, 3
+; SI-NEXT:    s_lshl_b32 s6, 0xffff, s7
+; SI-NEXT:    s_lshl_b32 s7, s2, s7
+; SI-NEXT:    s_not_b32 s8, s6
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:  .LBB136_1: ; %atomicrmw.start
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, s7, v1
+; SI-NEXT:    v_and_b32_e32 v0, s6, v0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, s8, v1
+; SI-NEXT:    v_or_b32_e32 v0, v2, v0
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_cbranch_execnz .LBB136_1
+; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s3, s0, 0x4650
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_and_b32 s0, s3, -4
+; VI-NEXT:    s_load_dword s9, s[0:1], 0x0
+; VI-NEXT:    s_and_b32 s3, s3, 3
+; VI-NEXT:    s_lshl_b32 s3, s3, 3
+; VI-NEXT:    s_lshl_b32 s6, 0xffff, s3
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_not_b32 s7, s6
+; VI-NEXT:    s_lshl_b32 s8, s2, s3
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:  .LBB136_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_subrev_u32_e32 v0, vcc, s8, v1
+; VI-NEXT:    v_and_b32_e32 v2, s7, v1
+; VI-NEXT:    v_and_b32_e32 v0, s6, v0
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    v_mov_b32_e32 v3, v1
+; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v1, v2
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB136_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_u32 s3, s0, 0x4650
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_and_b32 s0, s3, -4
+; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
+; GFX9-NEXT:    s_and_b32 s3, s3, 3
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX9-NEXT:    s_lshl_b32 s4, 0xffff, s3
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_not_b32 s5, s4
+; GFX9-NEXT:    s_lshl_b32 s6, s2, s3
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:  .LBB136_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB136_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr i16, ptr addrspace(1) %out, i64 9000
+  %val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i8 %in) {
+; SI-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_add_u32 s6, s0, 0x2328
+; SI-NEXT:    s_addc_u32 s1, s1, 0
+; SI-NEXT:    s_and_b32 s0, s6, -4
+; SI-NEXT:    s_and_b32 s6, s6, 3
+; SI-NEXT:    s_and_b32 s2, s2, 0xff
+; SI-NEXT:    s_load_dword s9, s[0:1], 0x0
+; SI-NEXT:    s_lshl_b32 s7, s6, 3
+; SI-NEXT:    s_lshl_b32 s6, 0xff, s7
+; SI-NEXT:    s_lshl_b32 s7, s2, s7
+; SI-NEXT:    s_not_b32 s8, s6
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:  .LBB137_1: ; %atomicrmw.start
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, s7, v1
+; SI-NEXT:    v_and_b32_e32 v0, s6, v0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, s8, v1
+; SI-NEXT:    v_or_b32_e32 v0, v2, v0
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
+; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_cbranch_execnz .LBB137_1
+; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s3, s0, 0x2328
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_and_b32 s0, s3, -4
+; VI-NEXT:    s_load_dword s9, s[0:1], 0x0
+; VI-NEXT:    s_and_b32 s3, s3, 3
+; VI-NEXT:    s_lshl_b32 s3, s3, 3
+; VI-NEXT:    s_lshl_b32 s6, 0xff, s3
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_not_b32 s7, s6
+; VI-NEXT:    s_lshl_b32 s8, s2, s3
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:  .LBB137_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_subrev_u32_e32 v0, vcc, s8, v1
+; VI-NEXT:    v_and_b32_e32 v2, s7, v1
+; VI-NEXT:    v_and_b32_e32 v0, s6, v0
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    v_mov_b32_e32 v3, v1
+; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v1, v2
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB137_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_u32 s3, s0, 0x2328
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_and_b32 s0, s3, -4
+; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
+; GFX9-NEXT:    s_and_b32 s3, s3, 3
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX9-NEXT:    s_lshl_b32 s4, 0xff, s3
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX9-NEXT:    s_not_b32 s5, s4
+; GFX9-NEXT:    s_lshl_b32 s6, s2, s3
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:  .LBB137_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB137_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr i8, ptr addrspace(1) %out, i64 9000
+  %val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret void
+}
+
+!0 = !{}
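NOTE: the new atomic_sub_i16/i8_soffset__amdgpu_no_remote_memory kernels above show that sub-word atomics are still lowered to a masked 32-bit compare-and-swap loop even with !amdgpu.no.remote.memory present, since the hardware has no 8/16-bit global atomics. The i32 system-scope tests in the next file show the complementary effect: with no metadata at all, the backend must assume the address may be remote or fine-grained, so a previously direct buffer_atomic_sub/flat_atomic_sub/global_atomic_sub is now expanded into the atomicrmw.start loop visible in the regenerated checks. In IR terms that expansion is roughly the canonical cmpxchg loop; a sketch under that assumption (the function name is illustrative, and this IR is not emitted verbatim by the patch):

; Sketch of the IR-level loop corresponding to the .LBB*_1 blocks in the
; checks that follow.
define i32 @sketch_expanded_sub(ptr addrspace(1) %p, i32 %v) {
entry:
  %init = load i32, ptr addrspace(1) %p, align 4
  br label %atomicrmw.start

atomicrmw.start:
  %loaded = phi i32 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
  %new = sub i32 %loaded, %v
  %pair = cmpxchg ptr addrspace(1) %p, i32 %loaded, i32 %new seq_cst seq_cst
  %newloaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret i32 %newloaded
}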
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index f7882e6f12022..704f57028188a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -1284,26 +1284,68 @@ define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b64 s[8:9], 0
+; SI-NEXT:  .LBB30_1: ; %atomicrmw.start
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_sub_i32_e32 v3, vcc, v4, v2
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
+; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_cbranch_execnz .LBB30_1
+; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_sub_i32_noret:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_atomic_sub v[0:1], v2
+; VI-NEXT:    flat_load_dword v4, v[0:1]
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB30_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v4, v2
+; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB30_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_noret:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_atomic_sub v[0:1], v2, off
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v3, v4, v2
+; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -1317,9 +1359,25 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    s_mov_b64 s[8:9], 0
+; SI-NEXT:  .LBB31_1: ; %atomicrmw.start
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_sub_i32_e32 v3, vcc, v4, v2
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
+; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_cbranch_execnz .LBB31_1
+; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1328,17 +1386,43 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_atomic_sub v[0:1], v2
+; VI-NEXT:    flat_load_dword v4, v[0:1]
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB31_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v4, v2
+; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB31_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_noret_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_atomic_sub v[0:1], v2, off offset:16
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v3, v4, v2
+; GFX9-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1353,27 +1437,71 @@ define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b64 s[8:9], 0
+; SI-NEXT:  .LBB32_1: ; %atomicrmw.start
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_sub_i32_e32 v4, vcc, v5, v2
+; SI-NEXT:    v_mov_b32_e32 v3, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v5
+; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_cbranch_execnz .LBB32_1
+; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v0, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_sub_i32_ret:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
+; VI-NEXT:    flat_load_dword v3, v[0:1]
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB32_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v4, v2
+; VI-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB32_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, v3
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_ret:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_atomic_sub v0, v[0:1], v2, off glc
+; GFX9-NEXT:    global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1387,29 +1515,72 @@ define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB33_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB33_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1421,23 +1592,37 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB34_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1447,20 +1632,44 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_sub 
v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1471,23 +1680,37 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB35_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1499,20 +1722,44 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; 
VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB35_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1524,23 +1771,37 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB36_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1550,20 +1811,46 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_sub 
v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1574,23 +1861,37 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB37_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1600,22 +1901,46 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1744,26 +2069,68 @@ define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB41_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB41_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; 
VI-LABEL: global_atomic_and_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB41_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1777,9 +2144,25 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB42_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1788,17 +2171,43 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB42_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -1813,27 +2222,71 @@ define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB43_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1847,29 +2300,72 @@ define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB44_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_and_b32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB44_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; 
GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -1881,23 +2377,37 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 +; SI-NEXT: v_and_b32_e32 v0, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB45_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1907,20 +2417,44 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB45_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1931,23 +2465,37 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 +; SI-NEXT: v_and_b32_e32 v0, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB46_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1959,20 +2507,44 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB46_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -1984,23 +2556,37 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB47_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2010,20 +2596,46 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; 
VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_and_b32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB47_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -2034,23 +2646,37 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: 
s_setpc_b64 s[30:31] @@ -2060,22 +2686,46 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_and_b32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -3030,26 +3680,68 @@ define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB61_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB61_1: ; 
%atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB61_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3063,9 +3755,25 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB62_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3074,17 +3782,43 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB62_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset: ; 
GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB62_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3099,27 +3833,71 @@ define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB63_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB63_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i32_ret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB63_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_or_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB63_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v3
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_ret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB63_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
  ret i32 %result
@@ -3133,29 +3911,72 @@ define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB64_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB64_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i32_ret_offset:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB64_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_or_b32_e32 v0, v1, v2
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB64_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_ret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB64_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3167,23 +3988,37 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB65_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0
+; SI-NEXT: v_or_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB65_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3193,20 +4028,44 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_or v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB65_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB65_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_noret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_or v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB65_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
@@ -3217,23 +4076,37 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB66_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_or_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB66_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3245,20 +4118,44 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
 ; VI-NEXT: s_addc_u32 s35, s5, 0
 ; VI-NEXT: v_mov_b32_e32 v0, s34
 ; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_or v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB66_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB66_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB66_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3270,23 +4167,37 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB67_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB67_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3296,20 +4207,46 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB67_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_or_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB67_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_ret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_or_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB67_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
  ret i32 %result
@@ -3320,23 +4257,37 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB68_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB68_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3346,22 +4297,46 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_add_u32 s34, s4, 16
 ; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB68_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_or_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB68_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_or_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB68_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3490,26 +4465,68 @@ define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB72_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB72_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xor_i32_noret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_xor v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB72_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB72_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_noret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB72_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
@@ -3523,9 +4540,25 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB73_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB73_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3534,17 +4567,43 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_xor v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB73_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB73_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB73_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3559,27 +4618,71 @@ define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB74_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB74_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xor_i32_ret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB74_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_xor_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB74_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v3
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_ret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB74_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
  ret i32 %result
@@ -3593,29 +4696,72 @@ define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB75_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB75_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xor_i32_ret_offset:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB75_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_xor_b32_e32 v0, v1, v2
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB75_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB75_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3627,23 +4773,37 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB76_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0
+; SI-NEXT: v_xor_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB76_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3653,20 +4813,44 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_xor v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB76_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB76_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_noret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB76_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
@@ -3677,23 +4861,37 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB77_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_xor_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB77_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3705,20 +4903,44 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT: s_addc_u32 s35, s5, 0
 ; VI-NEXT: v_mov_b32_e32 v0, s34
 ; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_xor v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB77_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB77_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB77_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3730,23 +4952,37 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB78_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB78_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3756,20 +4992,46 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB78_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_xor_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB78_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_ret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB78_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
  ret i32 %result
@@ -3780,23 +5042,37 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB79_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB79_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3806,22 +5082,46 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_add_u32 s34, s4, 16
 ; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB79_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_xor_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB79_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB79_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
  %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -5041,25 +6341,9 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB95_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB95_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -5068,43 +6352,17 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB95_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_i32_e32 v3, v4, v2
-; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: flat_atomic_smax v[0:1], v2
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB95_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_smax v[0:1], v2, off offset:16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB95_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -5119,72 +6377,29 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB96_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_max_i32_e32 v4, v5, v2
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB96_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v0, v2
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[3:4]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB96_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_max_i32_e32 v0, v1, v2
-; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB96_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_smax v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB96_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6199,25 +7414,9 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB108_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB108_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -6226,43 +7425,17 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB108_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_u32_e32 v3, v4, v2
-; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: flat_atomic_umax v[0:1], v2
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB108_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_umax v[0:1], v2, off offset:16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB108_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6277,72 +7450,29 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB109_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_max_u32_e32 v4, v5, v2
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB109_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v0, v2
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[3:4]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB109_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_max_u32_e32 v0, v1, v2
-; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB109_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_umax v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB109_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7032,25 +8162,9 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB118_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB118_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -7058,44 +8172,18 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB118_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_min_u32_e32 v3, v4, v2
-; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB118_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_umin v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_umin v[0:1], v2, off offset:16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB118_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7110,72 +8198,29 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB119_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_min_u32_e32 v4, v5, v2
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB119_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v0, v2
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[3:4]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB119_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_min_u32_e32 v0, v1, v2
-; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB119_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_umin v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB119_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -8272,25 +9317,9 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB132_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB132_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -8299,43 +9328,17 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB132_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_min_i32_e32 v3, v4, v2
-; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: flat_atomic_smin v[0:1], v2
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB132_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_smin v[0:1], v2, off offset:16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB132_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -8350,72 +9353,29 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB133_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_min_i32_e32 v4, v5, v2
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB133_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v0, v2
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[3:4]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB133_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_min_i32_e32 v0, v1, v2
-; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB133_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_smin v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB133_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -8434,26 +9394,74 @@ define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB134_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB134_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB134_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB134_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v4
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB134_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
 ret void
@@ -8467,9 +9475,27 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB135_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB135_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -8478,17 +9504,47 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB135_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB135_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v4
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB135_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8503,27 +9559,77 @@ define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB136_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2
+; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB136_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB136_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB136_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v3
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v4
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB136_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
 ret i32 %result
@@ -8537,29 +9643,78 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB137_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2
+; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB137_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB137_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB137_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v4
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB137_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8571,23 +9726,39 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB138_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v1
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB138_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8597,20 +9768,48 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB138_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB138_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v1
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB138_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
 ret void
@@ -8621,23 +9820,39 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB139_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v1
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB139_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8649,20 +9864,48 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
 ; VI-NEXT: s_addc_u32 s35, s5, 0
 ; VI-NEXT: v_mov_b32_e32 v0, s34
 ; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB139_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB139_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB139_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v1
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB139_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8674,23 +9917,39 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB140_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB140_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8700,20 +9959,50 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB140_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v4
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB140_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB140_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v3
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB140_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
 ret i32 %result
@@ -8724,23 +10013,39 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB141_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB141_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8750,22 +10055,50 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_add_u32 s34, s4, 16
 ; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB141_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v4
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB141_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v3
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB141_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8853,30 +10186,84 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_noret:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB144_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB144_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_noret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: .LBB144_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_cbranch_execnz .LBB144_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[6:7]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execnz .LBB144_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
 ret void
@@ -8886,13 +10273,33 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB145_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB145_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -8901,17 +10308,51 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: .LBB145_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_cbranch_execnz .LBB145_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[6:7]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execnz .LBB145_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8922,31 +10363,87 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_ret:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB146_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB146_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_ret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: .LBB146_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_cbranch_execnz .LBB146_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, v3
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execnz .LBB146_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
 ret i32 %result
@@ -8956,33 +10453,88 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB147_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB147_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: .LBB147_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_cbranch_execnz .LBB147_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[6:7]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execnz .LBB147_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8994,23 +10546,42 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v5, s6, 0
+; SI-NEXT: v_writelane_b32 v5, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: .LBB148_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, v1
+; SI-NEXT: v_mov_b32_e32 v3, v0
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB148_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v5, 1
+; SI-NEXT: v_readlane_b32 s6, v5, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9020,20 +10591,54 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: .LBB148_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_cbranch_execnz .LBB148_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[36:37], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1
+; GFX9-NEXT: v_add_u32_e32 v0, -1, v1
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX9-NEXT: s_cbranch_execnz .LBB148_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
 ret void
@@ -9044,23 +10649,42 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v5, s6, 0
+; SI-NEXT: v_writelane_b32 v5, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: .LBB149_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, v1
+; SI-NEXT: v_mov_b32_e32 v3, v0
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB149_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v5, 1
+; SI-NEXT: v_readlane_b32 s6, v5, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9072,20 +10696,54 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ; VI-NEXT: s_addc_u32 s35, s5, 0
 ; VI-NEXT: v_mov_b32_e32 v0, s34
 ; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: .LBB149_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_cbranch_execnz .LBB149_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[36:37], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1
+; GFX9-NEXT: v_add_u32_e32 v0, -1, v1
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX9-NEXT: s_cbranch_execnz .LBB149_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[36:37]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -9097,23 +10755,42 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v5, s6, 0
+; SI-NEXT: v_writelane_b32 v5, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: .LBB150_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v1, v4
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB150_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v5, 1
+; SI-NEXT: v_readlane_b32 s6, v5, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9123,20 +10800,56 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v3, s6
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB150_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, v0
+;
VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB150_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB150_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -9147,23 +10860,42 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v5, s6, 0 +; SI-NEXT: v_writelane_b32 v5, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: .LBB151_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; 
SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB151_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v5, 1 +; SI-NEXT: v_readlane_b32 s6, v5, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9173,22 +10905,56 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: .LBB151_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB151_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: .LBB151_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB151_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 55a2dd0eb9a14..6cae0dfac7558 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -623,7 +623,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) 
%out, i64 %in) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -695,7 +695,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -771,7 +771,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -853,7 +853,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -912,7 +912,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -983,7 +983,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -1056,7 +1056,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1135,7 +1135,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -1191,7 +1191,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr 
i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1263,7 +1263,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -1339,7 +1339,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1421,7 +1421,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1551,7 +1551,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -1624,7 +1624,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1703,7 +1703,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -1753,7 +1753,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw 
volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1822,7 +1822,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -1892,7 +1892,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1971,7 +1971,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -2024,7 +2024,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2092,7 +2092,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -2159,7 +2159,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -2285,7 +2285,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = 
atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2354,7 +2354,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -2424,7 +2424,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2503,7 +2503,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -2556,7 +2556,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2624,7 +2624,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -2691,7 +2691,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2767,7 +2767,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -2817,7 +2817,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr 
addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2886,7 +2886,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -2956,7 +2956,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3035,7 +3035,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -3088,7 +3088,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3156,7 +3156,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -3223,7 +3223,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3299,7 +3299,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -3349,7 +3349,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr 
i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3418,7 +3418,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -3488,7 +3488,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3567,7 +3567,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -3620,7 +3620,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3688,7 +3688,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -3755,7 +3755,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3831,7 +3831,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst + %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -3887,7 +3887,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_endpgm 
entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3959,7 +3959,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -4035,7 +4035,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4117,7 +4117,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -4176,7 +4176,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4247,7 +4247,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -4320,7 +4320,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4399,7 +4399,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -4455,7 +4455,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = 
atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4510,7 +4510,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4565,7 +4565,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr ptr, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4637,7 +4637,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -4713,7 +4713,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4795,7 +4795,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -4854,7 +4854,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4925,7 +4925,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -4998,7 +4998,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") 
seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -5077,7 +5077,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -5133,7 +5133,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -5205,7 +5205,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -5281,7 +5281,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -5363,7 +5363,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -5422,7 +5422,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -5493,7 +5493,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_endpgm entry: - %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -5566,7 +5566,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr 
addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -5645,7 +5645,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index - %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -7146,7 +7146,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -7218,7 +7218,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -7294,7 +7294,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -7349,7 +7349,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -7421,7 +7421,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 - %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 store i64 %tmp0, ptr addrspace(1) %out2 ret void } @@ -7497,6 +7497,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 - %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst + %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 59a99a6a0328d..f3b2ef8d69f9e 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -1329,26 +1329,76 @@ 
define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB30_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1362,9 +1412,29 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; 
SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB31_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1373,17 +1443,47 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB31_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1394,32 +1494,88 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_sub_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: 
buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB32_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7
+; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB32_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_sub_i64_ret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB32_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB32_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_ret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB32_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1429,34 +1585,88 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_sub_i64_ret_offset:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
 ; SI-NEXT: s_mov_b32 s6, 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB33_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7
+; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB33_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_sub_i64_ret_offset:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB33_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v8, v2
+; VI-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB33_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_ret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB33_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1468,25 +1678,43 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: s_mov_b32 s34, s7
-; SI-NEXT: s_mov_b32 s35, s6
+; SI-NEXT: v_writelane_b32 v9, s6, 0
+; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: .LBB34_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2
+; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v8, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v6, v1
+; SI-NEXT: v_mov_b32_e32 v5, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v5
+; SI-NEXT: v_mov_b32_e32 v3, v6
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB34_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v9, 1
+; SI-NEXT: v_readlane_b32 s6, v9, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1494,24 +1722,54 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
 ; VI-LABEL: global_atomic_sub_i64_noret_scalar:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: .LBB34_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
+; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB34_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_noret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB34_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -1522,23 +1780,43 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v9, s6, 0
+; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: .LBB35_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2
+; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v8, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v6, v1
+; SI-NEXT: v_mov_b32_e32 v5, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v5
+; SI-NEXT: v_mov_b32_e32 v3, v6
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB35_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v9, 1
+; SI-NEXT: v_readlane_b32 s6, v9, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1548,24 +1826,52 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_add_u32 s34, s4, 32
 ; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: .LBB35_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
+; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB35_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB35_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1577,25 +1883,43 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: s_mov_b32 s34, s7
-; SI-NEXT: s_mov_b32 s35, s6
+; SI-NEXT: v_writelane_b32 v9, s6, 0
+; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: .LBB36_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v8, v1
+; SI-NEXT: v_mov_b32_e32 v7, v0
+; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7
+; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v5
+; SI-NEXT: v_mov_b32_e32 v1, v6
+; SI-NEXT: v_mov_b32_e32 v2, v7
+; SI-NEXT: v_mov_b32_e32 v3, v8
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB36_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v9, 1
+; SI-NEXT: v_readlane_b32 s6, v9, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1603,24 +1927,54 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
 ; VI-LABEL: global_atomic_sub_i64_ret_scalar:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
 ; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: .LBB36_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, v1
+; VI-NEXT: v_mov_b32_e32 v7, v0
+; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
+; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB36_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_ret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB36_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1631,23 +1985,43 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v9, s6, 0
+; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: .LBB37_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v8, v1
+; SI-NEXT: v_mov_b32_e32 v7, v0
+; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7
+; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v5
+; SI-NEXT: v_mov_b32_e32 v1, v6
+; SI-NEXT: v_mov_b32_e32 v2, v7
+; SI-NEXT: v_mov_b32_e32 v3, v8
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB37_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v9, 1
+; SI-NEXT: v_readlane_b32 s6, v9, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1658,23 +2032,51 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT: s_add_u32 s34, s4, 32
 ; VI-NEXT: s_addc_u32 s35, s5, 0
 ; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
 ; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: .LBB37_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, v1
+; VI-NEXT: v_mov_b32_e32 v7, v0
+; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
+; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB37_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB37_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1767,26 +2169,76 @@ define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB40_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, v7, v3
+; SI-NEXT: v_and_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB40_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_and_i64_noret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB40_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v5, v7, v3
+; VI-NEXT: v_and_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB40_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_noret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB40_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -1800,9 +2252,29 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB41_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, v7, v3
+; SI-NEXT: v_and_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB41_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1811,17 +2283,47 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB41_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v5, v7, v3
+; VI-NEXT: v_and_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB41_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_noret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB41_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1832,32 +2334,88 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_and_i64_ret:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
 ; SI-NEXT: s_mov_b32 s6, 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB42_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_and_b32_e32 v9, v11, v6
+; SI-NEXT: v_and_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB42_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_and_i64_ret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB42_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_and_b32_e32 v5, v7, v3
+; VI-NEXT: v_and_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB42_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_ret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB42_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1867,34 +2425,88 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_and_i64_ret_offset:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
 ; SI-NEXT: s_mov_b32 s6, 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB43_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_and_b32_e32 v9, v11, v6
+; SI-NEXT: v_and_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB43_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_and_i64_ret_offset:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB43_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_and_b32_e32 v7, v9, v3
+; VI-NEXT: v_and_b32_e32 v6, v8, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB43_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_ret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB43_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1906,25 +2518,42 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB44_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_and_b32_e32 v1, s34, v3
+; SI-NEXT: v_and_b32_e32 v0, s35, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB44_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1932,24 +2561,52 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
 ; VI-LABEL: global_atomic_and_i64_noret_scalar:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: .LBB44_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, s7, v3
+; VI-NEXT: v_and_b32_e32 v0, s6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB44_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_noret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB44_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -1960,23 +2617,42 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB45_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: v_and_b32_e32 v1, s34, v3
+; SI-NEXT: v_and_b32_e32 v0, s35, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB45_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1986,24 +2662,50 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_add_u32 s34, s4, 32
 ; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB45_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, s7, v3
+; VI-NEXT: v_and_b32_e32 v0, s6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB45_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB45_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -2015,25 +2717,42 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB46_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, s34, v5
+; SI-NEXT: v_and_b32_e32 v2, s35, v4
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB46_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2041,24 +2760,52 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
 ; VI-LABEL: global_atomic_and_i64_ret_scalar:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
 ; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: .LBB46_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_and_b32_e32 v5, s7, v7
+; VI-NEXT: v_and_b32_e32 v4, s6, v6
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB46_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
+; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_ret_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_and_b32_e32 v4, s7, v6
+; GFX9-NEXT: v_and_b32_e32 v3, s6, v5
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB46_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -2069,23 +2816,42 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB47_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, s34, v5
+; SI-NEXT: v_and_b32_e32 v2, s35, v4
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB47_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2096,23 +2862,49 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT: s_add_u32 s34, s4, 32
 ; VI-NEXT: s_addc_u32 s35, s5, 0
 ; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
 ; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB47_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_and_b32_e32 v5, s7, v7
+; VI-NEXT: v_and_b32_e32 v4, s6, v6
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB47_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_and_b32_e32 v4, s7, v6
+; GFX9-NEXT: v_and_b32_e32 v3, s6, v5
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB47_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3197,26 +3989,76 @@ define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB60_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v5, v7, v3
+; SI-NEXT: v_or_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB60_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i64_noret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB60_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_or_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB60_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_noret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB60_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -3230,9 +4072,29 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB61_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v5, v7, v3
+; SI-NEXT: v_or_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB61_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3241,17 +4103,47 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB61_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_or_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB61_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_noret_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB61_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3262,32 +4154,88 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_or_i64_ret:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
 ; SI-NEXT: s_mov_b32 s6, 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB62_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_or_b32_e32 v9, v11, v6
+; SI-NEXT: v_or_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB62_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i64_ret:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB62_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_or_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB62_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_ret:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB62_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -3297,34 +4245,88 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_or_i64_ret_offset:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
 ; SI-NEXT: s_mov_b32 s6, 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB63_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_or_b32_e32 v9, v11, v6
+; SI-NEXT: v_or_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB63_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i64_ret_offset:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT:
flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_or_b32_e32 v7, v9, v3 +; VI-NEXT: v_or_b32_e32 v6, v8, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_setpc_b64 s[30:31] -; +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB63_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: global_atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3336,25 +4338,42 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB64_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_or_b32_e32 v1, s34, v3 +; SI-NEXT: v_or_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; 
SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB64_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3362,24 +4381,52 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; VI-LABEL: global_atomic_or_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB64_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB64_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB64_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3390,23 +4437,42 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, 
s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB65_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_or_b32_e32 v1, s34, v3 +; SI-NEXT: v_or_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB65_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3416,24 +4482,50 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB65_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB65_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, 
v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB65_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3445,25 +4537,42 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB66_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v3, s34, v5 +; SI-NEXT: v_or_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB66_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3471,24 +4580,52 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; VI-LABEL: global_atomic_or_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB66_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_or_b32_e32 v5, s7, v7 +; VI-NEXT: v_or_b32_e32 v4, s6, v6 +; 
VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB66_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_or_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_or_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB66_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3499,23 +4636,42 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v3, s34, v5 +; SI-NEXT: v_or_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB67_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3526,23 +4682,49 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_or_b32_e32 v5, s7, v7 +; VI-NEXT: v_or_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB67_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_or_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_or_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3635,26 +4817,76 @@ define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB70_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 
+; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB70_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB70_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB70_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3668,9 +4900,29 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB71_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB71_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3679,17 +4931,47 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB71_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB71_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB71_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3700,32 +4982,88 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_xor_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB72_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v6 +; SI-NEXT: v_xor_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB72_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB72_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB72_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB72_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3735,34 +5073,88 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: global_atomic_xor_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB73_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v6 +; SI-NEXT: v_xor_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: 
s_cbranch_execnz .LBB73_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB73_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_xor_b32_e32 v7, v9, v3 +; VI-NEXT: v_xor_b32_e32 v6, v8, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB73_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB73_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3774,25 +5166,42 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB74_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_xor_b32_e32 v1, s34, v3 +; SI-NEXT: v_xor_b32_e32 v0, 
s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB74_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3800,24 +5209,52 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; VI-LABEL: global_atomic_xor_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB74_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB74_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB74_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret 
void @@ -3828,23 +5265,42 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB75_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_xor_b32_e32 v1, s34, v3 +; SI-NEXT: v_xor_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB75_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3854,24 +5310,50 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB75_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB75_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB75_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3883,25 +5365,42 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB76_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, s34, v5 +; SI-NEXT: v_xor_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB76_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3909,24 +5408,52 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-LABEL: global_atomic_xor_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; 
VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB76_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_xor_b32_e32 v5, s7, v7 +; VI-NEXT: v_xor_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB76_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB76_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3937,23 +5464,42 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB77_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, s34, v5 +; SI-NEXT: v_xor_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 
0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB77_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3964,23 +5510,49 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB77_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_xor_b32_e32 v5, s7, v7 +; VI-NEXT: v_xor_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB77_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB77_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -5347,30 +6919,9 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB92_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 -; SI-NEXT: v_mov_b32_e32 v9, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_smax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB92_1 -; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5379,49 +6930,17 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB92_1: ; %atomicrmw.start -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB92_1 -; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[2:3], off offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5432,91 +6951,34 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; 
SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB93_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc -; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v8 -; SI-NEXT: v_mov_b32_e32 v1, v9 -; SI-NEXT: v_mov_b32_e32 v2, v10 -; SI-NEXT: v_mov_b32_e32 v3, v11 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_smax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB93_1 -; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB93_1: ; %atomicrmw.start -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB93_1 -; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz 
.LBB93_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6697,30 +8159,9 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB105_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 -; SI-NEXT: v_mov_b32_e32 v9, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_umax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB105_1 -; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6729,49 +8170,17 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB105_1: ; %atomicrmw.start -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB105_1 -; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v[2:3], off offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; 
GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6782,91 +8191,34 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB106_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc -; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v8 -; SI-NEXT: v_mov_b32_e32 v1, v9 -; SI-NEXT: v_mov_b32_e32 v2, v10 -; SI-NEXT: v_mov_b32_e32 v3, v11 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_umax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB106_1 -; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB106_1: ; %atomicrmw.start -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB106_1 -; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 
-; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7683,30 +9035,9 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB115_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 -; SI-NEXT: v_mov_b32_e32 v9, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_umin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB115_1 -; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7715,49 +9046,17 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB115_1: ; %atomicrmw.start -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB115_1 -; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v[2:3], off offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB115_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7768,91 +9067,34 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB116_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc -; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v8 -; SI-NEXT: v_mov_b32_e32 v1, v9 -; SI-NEXT: v_mov_b32_e32 v2, v10 -; SI-NEXT: v_mov_b32_e32 v3, v11 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_umin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB116_1 -; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB116_1: ; %atomicrmw.start -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; VI-NEXT: 
v_cndmask_b32_e32 v6, v2, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB116_1 -; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB116_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9134,30 +10376,9 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB129_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 -; SI-NEXT: v_mov_b32_e32 v9, v5 -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_smin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB129_1 -; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9166,49 +10387,17 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; VI-NEXT: s_mov_b64 
s[4:5], 0 -; VI-NEXT: .LBB129_1: ; %atomicrmw.start -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB129_1 -; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[2:3], off offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB129_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9219,91 +10408,34 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB130_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5] -; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc -; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v8 -; SI-NEXT: v_mov_b32_e32 v1, v9 -; SI-NEXT: v_mov_b32_e32 v2, v10 -; SI-NEXT: v_mov_b32_e32 v3, v11 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_smin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB130_1 -; SI-NEXT: ; %bb.2: ; 
%atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB130_1: ; %atomicrmw.start -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB130_1 -; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB130_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9322,26 +10454,85 @@ define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB131_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: 
v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB131_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB131_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB131_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB131_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9355,9 +10546,32 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB132_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB132_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9366,17 +10580,53 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB132_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB132_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB132_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9387,32 +10637,97 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_uinc_wrap_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB133_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB133_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB133_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB133_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB133_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9422,34 +10737,97 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB134_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB134_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB134_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB134_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start +; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB134_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9461,25 +10839,45 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB135_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB135_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9487,24 +10885,58 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-LABEL: 
global_atomic_uinc_wrap_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB135_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB135_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB135_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9515,23 +10947,45 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 
+; SI-NEXT: .LBB136_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB136_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9541,24 +10995,56 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB136_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB136_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, 
s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB136_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9570,25 +11056,45 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB137_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB137_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9596,24 +11102,58 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: 
v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB137_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB137_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB137_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9624,23 +11164,45 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB138_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v5, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB138_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9651,23 +11213,55 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB138_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB138_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz 
.LBB138_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9756,30 +11350,95 @@ define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_udec_wrap_i64_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB141_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB141_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB141_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB141_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB141_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9789,13 +11448,38 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB142_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB142_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9804,17 +11488,57 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB142_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], 
v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB142_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB142_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9825,32 +11549,103 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_udec_wrap_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB143_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: 
s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB143_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB143_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB143_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB143_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9860,34 +11655,103 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 
v[0:1], v[6:7], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB144_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB144_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB144_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB144_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB144_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9899,25 +11763,49 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .LBB145_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB145_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9925,24 +11813,66 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: 
v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB145_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB145_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB145_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9953,23 +11883,49 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; 
SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .LBB146_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB146_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9979,24 +11935,64 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: .LBB146_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB146_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB146_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -10008,25 +12004,49 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .LBB147_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9] +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB147_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, 
v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10034,24 +12054,66 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB147_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB147_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[7:8] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v8, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB147_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -10062,23 +12124,49 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .LBB148_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9] +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB148_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10089,23 +12177,63 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: .LBB148_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB148_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[7:8] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v8, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB148_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll index 28aa76ab12f37..057c09e9a255c 100644 --- a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll @@ -48,7 +48,7 @@ define i32 @global_agent_release_idempotent_or(ptr addrspace(1) %in) { ; GFX942-NEXT: s_setpc_b64 s[30:31] ; OPT-LABEL: @global_agent_release_idempotent_or( ; OPT-NEXT: entry: -; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4 +; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4 ; OPT-NEXT: ret i32 [[VAL]] ; entry: @@ -56,6 +56,42 @@ entry: ret i32 %val } +define i32 @global_agent_release_idempotent_or_no_remote(ptr addrspace(1) %in) { +; GFX942-LABEL: global_agent_release_idempotent_or_no_remote: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; OPT-LABEL: @global_agent_release_idempotent_or_no_remote( +; OPT-NEXT: entry: +; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4, !amdgpu.no.remote.memory [[META0:![0-9]+]] +; OPT-NEXT: ret i32 [[VAL]] +entry: + %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") release, align 4, !amdgpu.no.remote.memory !0 + ret i32 %val +} + +define i32 @global_agent_release_idempotent_or_no_fine_grained(ptr 
addrspace(1) %in) { +; GFX942-LABEL: global_agent_release_idempotent_or_no_fine_grained: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; OPT-LABEL: @global_agent_release_idempotent_or_no_fine_grained( +; OPT-NEXT: entry: +; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; OPT-NEXT: ret i32 [[VAL]] +entry: + %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") release, align 4, !amdgpu.no.fine.grained.memory !0 + ret i32 %val +} + define i32 @global_agent_acquire_release_idempotent_or(ptr addrspace(1) %in) { ; GFX942-LABEL: global_agent_acquire_release_idempotent_or: ; GFX942: ; %bb.0: ; %entry @@ -68,7 +104,7 @@ define i32 @global_agent_acquire_release_idempotent_or(ptr addrspace(1) %in) { ; GFX942-NEXT: s_setpc_b64 s[30:31] ; OPT-LABEL: @global_agent_acquire_release_idempotent_or( ; OPT-NEXT: entry: -; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4 +; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4 ; OPT-NEXT: ret i32 [[VAL]] ; entry: @@ -88,9 +124,8 @@ define i32 @global_agent_acquire_release_idempotent_or__no_fine_grained(ptr addr ; GFX942-NEXT: s_setpc_b64 s[30:31] ; OPT-LABEL: @global_agent_acquire_release_idempotent_or__no_fine_grained( ; OPT-NEXT: entry: -; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; OPT-NEXT: ret i32 [[VAL]] -; entry: %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") acq_rel, align 4, !amdgpu.no.fine.grained.memory !0 ret i32 %val @@ -108,7 +143,7 @@ define i32 @global_agent_seq_cst_idempotent_or(ptr addrspace(1) %in) { ; GFX942-NEXT: s_setpc_b64 s[30:31] ; OPT-LABEL: @global_agent_seq_cst_idempotent_or( ; OPT-NEXT: entry: -; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") seq_cst, align 4 +; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") seq_cst, align 4 ; OPT-NEXT: ret i32 [[VAL]] ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll index c48613b4bf3bf..2a693e1001cd9 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll @@ -276,23 +276,23 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ;. 
; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V4: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. 
; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V5: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. 
; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V6: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll index 9d98c70196dff..d09b4fda1b697 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll @@ -68,6 +68,6 @@ if.end: ret void } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index 1e56f6f8e2c41..4ae0ba065adfc 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -7,7 +7,7 @@ declare void @extern() #0 define float @foo(float %x) #0 { ; GCN-LABEL: define float @foo( -; GCN-SAME: float [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; GCN-SAME: float [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; GCN-NEXT: [[ENTRY:.*:]] ; GCN-NEXT: tail call void @extern() ; GCN-NEXT: [[MUL:%.*]] = fmul float [[X]], 1.500000e+01 @@ -21,7 +21,7 @@ entry: define amdgpu_kernel void @caller(ptr addrspace(1) %p) #1 { ; GCN-LABEL: define amdgpu_kernel void @caller( -; GCN-SAME: ptr addrspace(1) captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; GCN-SAME: ptr addrspace(1) captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { ; GCN-NEXT: [[ENTRY:.*:]] ; GCN-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(1) [[P]], align 4, !amdgpu.noclobber [[META0:![0-9]+]] ; GCN-NEXT: tail call void @extern() @@ -40,17 +40,14 @@ attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="t attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } ;. -; UNSAFE: attributes #[[ATTR0:[0-9]+]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; UNSAFE: attributes #[[ATTR1]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; UNSAFE: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } ;. 
-; NONANS: attributes #[[ATTR0:[0-9]+]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NONANS: attributes #[[ATTR1]] = { nounwind "amdgpu-waves-per-eu"="4,10" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NONANS: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } ;. -; NOINFS: attributes #[[ATTR0:[0-9]+]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NOINFS: attributes #[[ATTR1]] = { nounwind "amdgpu-waves-per-eu"="4,10" "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NOINFS: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } ;. ; UNSAFE: [[META0]] = !{} ;. diff --git a/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll index 7af6d3a0c640e..8e87256c24ce5 100644 --- a/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll @@ -55,8 +55,8 @@ define amdgpu_kernel void @issue120256_private(ptr addrspace(1) %out) { ; FIXME: Inference of amdgpu-no-queue-ptr should not depend on code object version. !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx803" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx803" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. 
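The attribute check-line updates above consistently drop the default "amdgpu-waves-per-eu"="4,10" from the inferred attribute sets, and the FIXME notes that "amdgpu-no-queue-ptr" inference still keys on the amdhsa_code_object_version module flag. Below is a minimal IR sketch of that dependence, assuming the usual aperture mechanics (code object v4 reads the private aperture through the queue pointer on targets like gfx803, while v5 takes it from implicit kernel arguments); the kernel name is hypothetical and not part of the patch.

define amdgpu_kernel void @cov_dependent_cast() {
  ; A constant-expression addrspacecast from private (addrspace 5) to flat
  ; requires the private aperture; under COV 400 that pins the queue pointer,
  ; so the attributor cannot add "amdgpu-no-queue-ptr" here.
  store volatile i32 0, ptr addrspacecast (ptr addrspace(5) null to ptr)
  ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}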
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll index 1c298014e33e7..300124848c1aa 100644 --- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll @@ -6,16 +6,24 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_i32 s0, s32, 0xf0 -; CHECK-NEXT: s_add_i32 s1, s32, 0xf4 -; CHECK-NEXT: s_add_i32 s2, s32, 0xf8 -; CHECK-NEXT: s_add_i32 s3, s32, 0xfc +; CHECK-NEXT: s_movk_i32 s1, 0xf4 +; CHECK-NEXT: s_movk_i32 s2, 0xf8 +; CHECK-NEXT: s_movk_i32 s3, 0xfc +; CHECK-NEXT: s_movk_i32 s34, 0x100 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: s_add_i32 s34, s32, 0x100 -; CHECK-NEXT: s_add_i32 s35, s32, 0x104 -; CHECK-NEXT: s_add_i32 s36, s32, 0x108 -; CHECK-NEXT: s_add_i32 s37, s32, 0x110 -; CHECK-NEXT: s_add_i32 s38, s32, 0x120 +; CHECK-NEXT: s_movk_i32 s35, 0x104 +; CHECK-NEXT: s_movk_i32 s36, 0x108 +; CHECK-NEXT: s_movk_i32 s37, 0x110 +; CHECK-NEXT: s_movk_i32 s38, 0x120 +; CHECK-NEXT: s_add_i32 s0, s32, 0xf0 +; CHECK-NEXT: s_add_i32 s1, s32, s1 +; CHECK-NEXT: s_add_i32 s2, s32, s2 +; CHECK-NEXT: s_add_i32 s3, s32, s3 +; CHECK-NEXT: s_add_i32 s34, s32, s34 +; CHECK-NEXT: s_add_i32 s35, s32, s35 +; CHECK-NEXT: s_add_i32 s36, s32, s36 +; CHECK-NEXT: s_add_i32 s37, s32, s37 +; CHECK-NEXT: s_add_i32 s38, s32, s38 ; CHECK-NEXT: s_or_b32 s39, s32, 4 ; CHECK-NEXT: s_or_b32 s40, s32, 8 ; CHECK-NEXT: s_or_b32 s41, s32, 12 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll index b77b2f7441a0c..1ec4f250f8726 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s @@ -19,6 +21,30 @@ define amdgpu_kernel void @v_alignbyte_b32(ptr addrspace(1) %out, i32 %src1, i32 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbyte_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_alignbyte_b32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbyte_b32 v0, s0, s1, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: 
s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 @@ -73,6 +99,41 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_alignbyte_b32 v1, v1, v2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_alignbyte_b32_2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbyte_b32 v0, v1, v0, s2 +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32_2: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index 4b113d80dd0e9..9274357c42038 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -168,7 +168,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) @@ -179,7 +179,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) @@ -213,7 +213,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,1] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -225,7 +225,7 @@ define <2 x half> 
@test_cvt_scalef32_f16_fp8_byte2_dst_hi(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,1] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -259,7 +259,7 @@ define float @test_cvt_scalef32_f32_fp8_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1) ret float %ret @@ -269,7 +269,7 @@ define float @test_cvt_scalef32_f32_fp8_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2) ret float %ret @@ -300,7 +300,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) @@ -311,7 +311,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_lo(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_lo: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) @@ -345,7 +345,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_hi(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_hi: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,1] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -357,7 +357,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_hi(i32 %src, float %scale ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_hi: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,1] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -391,7 +391,7 @@ define float @test_cvt_scalef32_f32_bf8_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: 
v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 1) ret float %ret @@ -401,7 +401,7 @@ define float @test_cvt_scalef32_f32_bf8_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 2) ret float %ret @@ -773,7 +773,7 @@ define <2 x float> @test_cvt_scale_f32_fp4_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f32_fp4_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 1) ret <2 x float> %ret @@ -783,7 +783,7 @@ define <2 x float> @test_cvt_scale_f32_fp4_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f32_fp4_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 2) ret <2 x float> %ret @@ -813,7 +813,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1(i32 %old, float %src0, float %src1, flo ; GCN-LABEL: test_cvt_scale_fp4_f32_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1) ret i32 %ret @@ -823,7 +823,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2(i32 %old, float %src0, float %src1, flo ; GCN-LABEL: test_cvt_scale_fp4_f32_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2) ret i32 %ret @@ -895,7 +895,7 @@ define <2 x half> @test_cvt_scale_f16_fp4_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f16_fp4_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 1) ret <2 x half> %ret @@ -905,7 +905,7 @@ define <2 x half> @test_cvt_scale_f16_fp4_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_f16_fp4_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> 
@llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 2) ret <2 x half> %ret @@ -935,7 +935,7 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte1(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 1) ret <2 x bfloat> %ret @@ -945,7 +945,7 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte2(i32 %src, float %scale) { ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 2) ret <2 x bfloat> %ret @@ -1302,7 +1302,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1(<2 x half> %src0, float %scale, i32 ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1314,7 +1314,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2(<2 x half> %src0, float %scale, i32 ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1380,7 +1380,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1(<2 x bfloat> %src0, float %scale, i ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1392,7 +1392,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte2(<2 x bfloat> %src0, float %scale, i ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1602,7 +1602,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src(i32 inreg %s ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,0] ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) @@ -1613,7 +1613,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src(i32 inreg %s ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src: ; GCN: ; 
%bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,0] ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) @@ -1647,7 +1647,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src(i32 inreg %s ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,1] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1659,7 +1659,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src(i32 inreg %s ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,1] +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1693,7 +1693,7 @@ define float @test_cvt_scalef32_f32_fp8_byte1_inreg_src(i32 inreg %src, float %s ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1) ret float %ret @@ -1703,7 +1703,7 @@ define float @test_cvt_scalef32_f32_fp8_byte2_inreg_src(i32 inreg %src, float %s ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2) ret float %ret @@ -1734,7 +1734,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src(i32 inreg %s ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,0] ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) @@ -1745,7 +1745,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_lo_inreg_src(i32 inreg %s ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_lo_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,0] ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) @@ -1779,7 +1779,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_hi_inreg_src(i32 inreg %s ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_hi_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,1] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1791,7 +1791,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_hi_inreg_src(i32 inreg %s ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_hi_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,1] +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1825,7 +1825,7 @@ define float @test_cvt_scalef32_f32_bf8_byte1_inreg_src(i32 inreg %src, float %s ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 1) ret float %ret @@ -1835,7 +1835,7 @@ define float @test_cvt_scalef32_f32_bf8_byte2_inreg_src(i32 inreg %src, float %s ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 2) ret float %ret @@ -2032,7 +2032,7 @@ define <2 x float> @test_cvt_scale_f32_fp4_byte1_inreg_src(i32 inreg %src, float ; GCN-LABEL: test_cvt_scale_f32_fp4_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 1) ret <2 x float> %ret @@ -2042,7 +2042,7 @@ define <2 x float> @test_cvt_scale_f32_fp4_byte2_inreg_src(i32 inreg %src, float ; GCN-LABEL: test_cvt_scale_f32_fp4_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 2) ret <2 x float> %ret @@ -2072,7 +2072,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1_inreg_src(i32 %old, float inreg %src0, ; GCN-LABEL: test_cvt_scale_fp4_f32_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1) ret i32 %ret @@ -2082,7 +2082,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2_inreg_src(i32 %old, float inreg %src0, ; GCN-LABEL: test_cvt_scale_fp4_f32_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0] ; 
GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2) ret i32 %ret @@ -2112,7 +2112,7 @@ define <2 x half> @test_cvt_scale_f16_fp4_byte1_inreg_src(i32 inreg %src, float ; GCN-LABEL: test_cvt_scale_f16_fp4_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 1) ret <2 x half> %ret @@ -2122,7 +2122,7 @@ define <2 x half> @test_cvt_scale_f16_fp4_byte2_inreg_src(i32 inreg %src, float ; GCN-LABEL: test_cvt_scale_f16_fp4_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 2) ret <2 x half> %ret @@ -2152,7 +2152,7 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte1_inreg_src(i32 inreg %src, flo ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[1,0,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 1) ret <2 x bfloat> %ret @@ -2162,7 +2162,7 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte2_inreg_src(i32 inreg %src, flo ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[0,1,0] ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 2) ret <2 x bfloat> %ret @@ -2515,7 +2515,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1_inreg_src(<2 x half> inreg %src0, fl ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2527,7 +2527,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2_inreg_src(<2 x half> inreg %src0, fl ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2562,7 +2562,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1_inreg_src(<2 x bfloat> inreg %src0, ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2574,7 +2574,7 @@ 
define i32 @test_cvt_scalef32_fp4_bf16_byte2_inreg_src(<2 x bfloat> inreg %src0, ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2_inreg_src: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1] +; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll index d3851b1a084d6..b2498f28486dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll @@ -28,7 +28,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_1(ptr addrspace(1) ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -42,7 +42,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_2(ptr addrspace(1) ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -84,7 +84,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -98,7 +98,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -140,7 +140,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -154,7 +154,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 
@@ -196,7 +196,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_1(ptr addrspace(1) ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -210,7 +210,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_2(ptr addrspace(1) ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -252,7 +252,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -266,7 +266,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -308,7 +308,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -322,7 +322,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v5, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll index 7433f6611cd9b..2874ef0396d90 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll @@ -25,7 +25,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_1(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: 
global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -39,7 +39,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_2(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -81,7 +81,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_1(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -95,7 +95,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_2(ptr addrspace(1) % ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v5, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v6, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -137,7 +137,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_1(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v6, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1] ; GFX950-NEXT: global_store_dword v[0:1], v7, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 @@ -151,7 +151,7 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_2(ptr addrspace(1) %o ; GFX950: ; %bb.0: ; GFX950-NEXT: global_load_dword v6, v[0:1], off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1] +; GFX950-NEXT: v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0] ; GFX950-NEXT: global_store_dword v[0:1], v7, off ; GFX950-NEXT: s_endpgm %old = load i32, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index aad6e031aa9ed..b07dec326327e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -536,9 +536,12 @@ ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 + ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 + ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48 + ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 + ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 + ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], 
v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] @@ -547,9 +550,12 @@ ; GCN-NEXT: v_exp_f32_e32 v58, v58 ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 + ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 + ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 + ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 + ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 + ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] @@ -561,8 +567,10 @@ ; GCN-NEXT: v_exp_f32_e32 v59, v57 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48 + ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 + ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 + ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 04ee0bbd17673..eee23a15af503 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -1425,9 +1425,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1447,8 +1448,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1468,8 +1470,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a1, v17 ; GCN-NEXT: v_accvgpr_write_b32 a2, v18 ; 
GCN-NEXT: v_accvgpr_write_b32 a3, v19 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1567,8 +1570,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1592,8 +1596,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1621,8 +1626,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1646,8 +1652,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1675,8 +1682,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1700,8 +1708,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 +; GISEL-NEXT: 
v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1721,8 +1730,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b32_e32 v17, s16 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1750,8 +1760,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a1, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s23 +; SDAG-NEXT: v_mov_b32_e32 v9, s24 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1775,8 +1786,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a1, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s23 +; GISEL-NEXT: v_mov_b32_e32 v9, s24 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1789,22 +1801,43 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v20, -2 +; SDAG-NEXT: v_mov_b32_e32 v21, 33 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], 
a[0:3], v21, v20 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 33 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <4 x float> %result } @@ -1813,13 +1846,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v20, -2 +; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1836,8 +1870,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1853,14 +1888,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v21, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: 
s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 @@ -1895,7 +1930,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -1915,16 +1950,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v21, s12 +; SDAG-NEXT: v_mov_b32_e32 v22, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: @@ -1937,20 +1973,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s29 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v20, s28 +; GISEL-NEXT: v_mov_b32_e32 v21, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -1962,9 +1997,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: s_movk_i32 s6, 0x41 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; 
SDAG-NEXT: v_mov_b32_e32 v21, -2 +; SDAG-NEXT: v_mov_b32_e32 v22, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -1974,7 +2010,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -1983,41 +2019,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v13, s21 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v21, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2029,9 +2062,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: s_movk_i32 s6, 0x41 -; SDAG-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v21, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -2041,7 +2075,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -2050,41 +2084,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v13, s21 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v21, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2096,8 +2127,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: 
s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v21, -2 +; SDAG-NEXT: v_mov_b32_e32 v22, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -2107,7 +2140,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -2116,41 +2149,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v13, s21 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v20, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v21, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, 
align 16 @@ -2162,8 +2192,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; SDAG-NEXT: v_mov_b32_e32 v21, 0.15915494 +; SDAG-NEXT: v_mov_b32_e32 v22, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 @@ -2173,7 +2205,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v5, s13 ; SDAG-NEXT: v_mov_b32_e32 v6, s14 ; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; SDAG-NEXT: v_mov_b32_e32 v8, s16 ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 @@ -2182,41 +2214,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b32_e32 v13, s21 ; SDAG-NEXT: v_mov_b32_e32 v14, s22 ; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 +; GISEL-NEXT: v_mov_b32_e32 v20, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v21, 0.15915494 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2268,43 +2297,85 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v20, 1 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mov_b32_e32 v17, 1 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <4 x float> %result } define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: 
v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_mov_b32_e32 v16, 1 +; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <4 x float> %result } @@ -2559,5 +2630,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 91197f915b659..b977cc9f2117e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -3387,10 +3387,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; 
GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 @@ -3436,9 +3437,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 @@ -3484,9 +3486,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_accvgpr_write_b32 a12, v28 ; GCN-NEXT: v_accvgpr_write_b32 a13, v29 ; GCN-NEXT: v_accvgpr_write_b32 a14, v30 +; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 @@ -3659,8 +3662,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3709,8 +3713,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -3763,8 +3768,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3813,8 +3819,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -3867,8 +3874,9 @@ define 
<16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a13, v21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, v22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v23 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3917,8 +3925,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a13, v21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, v22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, v23 +; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -3963,8 +3972,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_accvgpr_write_b32 a13, s25 ; GCN-NEXT: v_accvgpr_write_b32 a14, s26 ; GCN-NEXT: v_accvgpr_write_b32 a15, s27 +; GCN-NEXT: v_mov_b32_e32 v17, s28 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 @@ -4114,48 +4124,95 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 
v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 33 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 33 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; 
GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2) ret <16 x float> %result } @@ -4165,7 +4222,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4183,7 +4241,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4210,6 +4268,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 ; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4227,7 +4286,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -4257,7 +4316,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4275,7 +4335,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, 1.0 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4302,6 +4362,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 ; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 ; GISEL-NEXT: 
v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4319,7 +4380,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, 1.0 op_sel_hi:[1,1,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -4345,106 +4406,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1.0, -2 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; 
GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0.15915494, 1.0 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) - ret <16 x float> %result -} - -define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: scratch_load_dword a15, off, s32 -; SDAG-NEXT: s_movk_i32 s0, 0x41 -; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v31, -2 +; SDAG-NEXT: v_mov_b32_e32 v32, 1.0 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4462,7 +4429,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4484,12 +4451,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: scratch_load_dword a15, off, s32 -; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d +; GISEL-NEXT: v_mov_b32_e32 v31, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v32, -2 ; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 @@ -4528,103 +4495,277 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 ; GISEL-NEXT: 
v_accvgpr_read_b32 v15, a15 ; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2) ret <16 x float> %result } -define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 -; SDAG-NEXT: v_mov_b32_e32 v16, s1 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 -; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494 +; GISEL-NEXT: v_mov_b32_e32 v32, 1.0 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) 
+; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d +; SDAG-NEXT: v_mov_b32_e32 v32, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; 
GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77) + ret <16 x float> %result +} + +define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 { +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v32, s0 +; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; SDAG-NEXT: 
global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b32_e32 v32, s0 +; GISEL-NEXT: v_mov_b32_e32 v33, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4636,94 +4777,80 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: s_movk_i32 s2, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v32, -2 +; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: 
v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v33, -2 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GISEL-NEXT: 
v_accvgpr_write_b32 a6, s42 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4770,9 +4897,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v16, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; SDAG-NEXT: v_mov_b32_e32 v0, s20 ; SDAG-NEXT: v_mov_b32_e32 v1, s21 ; SDAG-NEXT: v_mov_b32_e32 v2, s22 @@ -4844,10 +4972,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; GISEL-NEXT: v_mov_b32_e32 v17, s1 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 
op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -4885,6 +5014,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v16, 42 +; SDAG-NEXT: v_mov_b32_e32 v17, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 @@ -4921,7 +5052,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v0, s20 ; SDAG-NEXT: v_mov_b32_e32 v1, s21 ; SDAG-NEXT: v_mov_b32_e32 v2, s22 @@ -4965,9 +5096,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b32_e32 v16, 25 +; GISEL-NEXT: v_mov_b32_e32 v17, 42 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] @@ -4993,10 +5124,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] @@ -5012,7 +5144,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 @@ -5033,78 +5165,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, 
s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v44, s24 +; SDAG-NEXT: v_mov_b32_e32 v45, s25 +; SDAG-NEXT: v_mov_b32_e32 v46, s26 +; SDAG-NEXT: v_mov_b32_e32 v47, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: 
global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5112,61 +5238,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a31, s23 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a30, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a29, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a28, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a27, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a26, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a25, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a24, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a23, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a22, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a21, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a20, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a19, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a18, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a17, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a16, s8 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: 
v_mov_b64_e32 v[32:33], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5179,79 +5289,73 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 +; SDAG-NEXT: v_mov_b32_e32 v32, 42 +; SDAG-NEXT: v_mov_b32_e32 v33, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 
+; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v24, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s21 +; SDAG-NEXT: v_mov_b32_e32 v26, s22 +; SDAG-NEXT: v_mov_b32_e32 v27, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v28, s24 +; SDAG-NEXT: v_mov_b32_e32 v29, s25 +; SDAG-NEXT: v_mov_b32_e32 v30, s26 +; SDAG-NEXT: v_mov_b32_e32 v31, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; 
SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5259,61 +5363,54 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b32_e32 v32, 25 +; GISEL-NEXT: v_mov_b32_e32 v33, 42 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: 
v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -5417,95 +5514,189 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 1 +; SDAG-NEXT: v_mov_b32_e32 v32, 0 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 0 +; GISEL-NEXT: v_mov_b32_e32 v32, 1 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: 
v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) ret <16 x float> %result } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) { -; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_dword a15, off, s32 -; GCN-NEXT: v_accvgpr_write_b32 a0, v16 -; GCN-NEXT: v_accvgpr_write_b32 a1, v17 -; GCN-NEXT: v_accvgpr_write_b32 a2, v18 -; GCN-NEXT: v_accvgpr_write_b32 a3, v19 -; GCN-NEXT: v_accvgpr_write_b32 a4, v20 -; GCN-NEXT: v_accvgpr_write_b32 a5, v21 -; GCN-NEXT: v_accvgpr_write_b32 a6, v22 -; GCN-NEXT: v_accvgpr_write_b32 a7, v23 -; GCN-NEXT: v_accvgpr_write_b32 a8, v24 -; GCN-NEXT: v_accvgpr_write_b32 a9, v25 -; GCN-NEXT: v_accvgpr_write_b32 a10, v26 -; GCN-NEXT: v_accvgpr_write_b32 a11, v27 -; GCN-NEXT: v_accvgpr_write_b32 a12, v28 -; GCN-NEXT: v_accvgpr_write_b32 a13, v29 -; GCN-NEXT: v_accvgpr_write_b32 a14, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: scratch_load_dword a15, off, s32 +; SDAG-NEXT: v_mov_b32_e32 v31, 0 +; SDAG-NEXT: v_mov_b32_e32 v32, 1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 +; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 +; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 +; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 +; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 +; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 +; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 +; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 +; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 +; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 +; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 +; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 +; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 +; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 +; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 +; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: scratch_load_dword a15, off, s32 +; GISEL-NEXT: v_mov_b32_e32 v31, 1 +; GISEL-NEXT: v_mov_b32_e32 v32, 0 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 +; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 +; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 +; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 +; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 +; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 +; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 +; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 +; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 +; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 +; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 +; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 +; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 +; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 +; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 +; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 +; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0) ret <16 x float> %result } @@ -6302,6 +6493,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x 
i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index b80c259c255d2..db80f5479d36b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -74,7 +74,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x5000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_addk_i32 s0, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 @@ -175,7 +176,9 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_add_i32 s0, s33, 0x5000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_add_i32 s1, s33, s0 +; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 @@ -223,30 +226,35 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1 ; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: s_movk_i32 s5, 0x12d4 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 -; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d4, v1 +; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 +; MUBUF-NEXT: s_movk_i32 s5, 0x12d0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 ; MUBUF-NEXT: s_movk_i32 s4, 0x4000 ; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d0, v1 +; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 +; MUBUF-NEXT: s_movk_i32 s5, 0x12c4 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 ; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0 ; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c4, v1 -; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: v_or_b32_e32 v2, 0x12cc, v3 +; MUBUF-NEXT: s_movk_i32 s4, 0x12cc +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v2, s4, v3 +; MUBUF-NEXT: s_movk_i32 s4, 0x12c8 ; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000 ; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000 ; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c8, v6 +; MUBUF-NEXT: v_or_b32_e32 v2, s4, v6 ; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000 ; 
MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen glc
@@ -298,7 +306,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
+; FLATSCR-NEXT: s_addk_i32 s0, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
new file mode 100644
index 0000000000000..8fec92ca8cfd9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Regression test for issue 160181
+; One variable is chosen to be assigned address zero. Here, that's @both.
+; Then other variables should be allocated at fixed offsets from that, provided
+; they are allocated by all the other kernels that presently allocate the
+; variable at address zero.
+; The failure mode was in that second check: variables could be added to
+; the module scope zero address struct even when some of the kernels allocating
+; that struct do not need the additional variable.
+
+; With current llvm, all three of these integers are put in the module scope struct, even though
+; neither kern_one nor kern_two accesses all three.
+
+@both = addrspace(3) global i32 poison
+@both_second = addrspace(3) global i16 poison ; a second field in the module struct
+@one = addrspace(3) global i32 poison
+@two = addrspace(3) global i32 poison
+
+
+;.
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+; CHECK: @llvm.amdgcn.kernel.kern_one.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_one.lds.t poison, align 4, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.kernel.kern_two.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_two.lds.t poison, align 4, !absolute_symbol [[META1]]
+; CHECK: @llvm.amdgcn.kernel.kern_block_direct_allocation.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_block_direct_allocation.lds.t poison, align 4, !absolute_symbol [[META1]]
+
+;.
+define void @func_one() { +; CHECK-LABEL: define {{[^@]+}}@func_one() { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2:![0-9]+]] +; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4 +; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[ONE1]], align 4 +; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11:![0-9]+]] +; CHECK-NEXT: ret void +; + %val0 = load i32, ptr addrspace(3) @both + store i32 %val0, ptr addrspace(3) @one + store i16 10, ptr addrspace(3) @both_second + ret void +} + +define amdgpu_kernel void @kern_one() { +; CHECK-LABEL: define {{[^@]+}}@kern_one +; CHECK-SAME: () #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META16:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_one.lds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META17:![0-9]+]] +; CHECK-NEXT: call void @func_one() +; CHECK-NEXT: ret void +; +entry: + call void @func_one() + ret void +} + +define void @func_two() { +; CHECK-LABEL: define {{[^@]+}}@func_two() { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]] +; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4 +; CHECK-NEXT: [[TWO1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[TWO1]], align 4 +; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]] +; CHECK-NEXT: ret void +; + %val0 = load i32, ptr addrspace(3) @both + store i32 %val0, ptr addrspace(3) @two + store i16 20, ptr addrspace(3) @both_second + ret void +} + +define amdgpu_kernel void @kern_two() { +; CHECK-LABEL: define {{[^@]+}}@kern_two +; CHECK-SAME: () #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META18:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_two.lds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] +; CHECK-NEXT: call void @func_two() +; CHECK-NEXT: ret void +; +entry: + call void @func_two() + ret void +} + +; Unrelated to the bug at hand, but if a variable is only +; reachable from a single kernel, it gets allocated to a fixed +; address independent of the module scope struct. This kernel +; means the key variables miss that optimisation while @both +; remains the best candidate for address zero allocation. 
+define void @func_block_direct_allocation() { +; CHECK-LABEL: define {{[^@]+}}@func_block_direct_allocation() { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4 +; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) [[ONE1]], align 4 +; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4 +; CHECK-NEXT: [[TWO2:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) +; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) [[TWO2]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[VAL1]], [[VAL2]] +; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]] +; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]] +; CHECK-NEXT: ret void +; + %val1 = load i32, ptr addrspace(3) @one + %val2 = load i32, ptr addrspace(3) @two + %sum = add i32 %val1, %val2 + store i32 %sum, ptr addrspace(3) @both + store i16 30, ptr addrspace(3) @both_second + ret void +} + +define amdgpu_kernel void @kern_block_direct_allocation() { +; CHECK-LABEL: define {{[^@]+}}@kern_block_direct_allocation +; CHECK-SAME: () #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META21:![0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_block_direct_allocation.lds) ], !alias.scope [[META22:![0-9]+]], !noalias [[META25:![0-9]+]] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @func_block_direct_allocation() +; CHECK-NEXT: call void @func_one() +; CHECK-NEXT: call void @func_two() +; CHECK-NEXT: ret void +; + call void @func_block_direct_allocation() + call void @func_one() + call void @func_two() + ret void +} +;. 
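+; A rough size check, inferred from the attribute checks below and the assumed
+; layout sketched above (4-byte alignment): the module struct { i32, i16 } pads
+; to 8 bytes; kern_one and kern_two each add a single i32, giving
+; "amdgpu-lds-size"="12", and kern_block_direct_allocation allocates both @one
+; and @two, giving "amdgpu-lds-size"="16".
+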
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="12" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-lds-size"="16" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll index cd428be729ae2..966e5c8f460dc 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll @@ -35,7 +35,8 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 { ; CHECK-LABEL: {{^}}name: realign_stack ; CHECK: scratchReservedForDynamicVGPRs: 512 %v = alloca <32 x i32>, align 128, addrspace(5) - store <32 x i32> %x, ptr addrspace(5) %v + ; use volatile store to avoid promotion of alloca to registers + store volatile <32 x i32> %x, ptr addrspace(5) %v call amdgpu_gfx void @callee(i32 71) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index 4585eca8fe894..c01c2be23b83f 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -157,19 +157,19 @@ name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__c tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -180,18 +180,18 @@ name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__c tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, 
implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -202,19 +202,19 @@ name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_v tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, 
implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -225,18 +225,18 @@ name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_v tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 1 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, 
implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... 
@@ -247,18 +247,18 @@ name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_ tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz0_blgp0____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 3 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, 
killed $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... @@ -269,17 +269,17 @@ name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_ tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-LABEL: name: V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64___xdl_write_vgpr__cbsz2_blgp2____xdl_read_overlap_vgpr_srcC - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ... diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll index 37a261cab7563..e8bd640aa5409 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -7,23 +7,25 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) { ; MUBUF-LABEL: memcpy_fixed_align: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 ; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off ; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 +; MUBUF-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24 ; MUBUF-NEXT: s_lshr_b32 s4, s32, 6 ; MUBUF-NEXT: s_waitcnt vmcnt(2) -; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 -; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 -; MUBUF-NEXT: s_waitcnt vmcnt(3) ; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12 ; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8 ; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 -; MUBUF-NEXT: s_waitcnt vmcnt(6) +; MUBUF-NEXT: s_waitcnt vmcnt(5) ; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16 +; MUBUF-NEXT: s_waitcnt vmcnt(8) +; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:36 +; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:32 +; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:28 +; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; use s4 ; MUBUF-NEXT: ;;#ASMEND @@ -35,14 +37,14 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, 
ptr addrspace(1) %src) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off ; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 -; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 +; FLATSCR-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24 ; FLATSCR-NEXT: s_mov_b32 s0, s32 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32 +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s32 offset:24 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s0 ; FLATSCR-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0003366f3a3ea..5b7c36559a366 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v12, s3 -; CHECK-NEXT: v_mov_b32_e32 v11, s2 -; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 -; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 -; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] -; CHECK-NEXT: v_mov_b32_e32 v12, s1 -; CHECK-NEXT: v_mov_b32_e32 v11, s0 +; CHECK-NEXT: v_mov_b32_e32 v9, s3 +; CHECK-NEXT: v_mov_b32_e32 v8, s2 +; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32 +; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9] +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16 +; CHECK-NEXT: v_mov_b32_e32 v9, s1 +; CHECK-NEXT: v_mov_b32_e32 v8, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 -; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 -; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32 +; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: v_mov_b32_e32 v26, s0 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 ; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 ; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 ; CHECK-NEXT: 
buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 -; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 @@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] 
offset:16 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] @@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 ; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 -; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 -; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96 +; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112 ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 @@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v12, s3 -; CHECK-NEXT: v_mov_b32_e32 v11, s2 -; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 -; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 -; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] -; CHECK-NEXT: v_mov_b32_e32 v12, s1 -; CHECK-NEXT: v_mov_b32_e32 v11, s0 +; CHECK-NEXT: v_mov_b32_e32 v9, s3 +; CHECK-NEXT: v_mov_b32_e32 v8, s2 +; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32 +; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9] +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16 +; CHECK-NEXT: v_mov_b32_e32 v9, s1 +; CHECK-NEXT: v_mov_b32_e32 v8, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 -; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 -; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32 +; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39 +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: v_mov_b32_e32 v26, s0 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 ; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 ; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v11, 
v26, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 -; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 @@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] @@ -571,8 +567,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 ; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 -; CHECK-NEXT: ds_read2_b64 
v[8:11], v16 offset0:12 offset1:13
-; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
+; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
 ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
index b43ccc551ca95..048610184368d 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -27,19 +27,16 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -83,19 +80,16 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -239,19 +233,16 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -295,19 +286,16 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -437,7 +425,7 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -451,19 +439,15 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -475,8 +459,8 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -492,7 +476,7 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -506,19 +490,15 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -530,8 +510,8 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -643,12 +623,9 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -660,24 +637,16 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24
-; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -689,18 +658,13 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -712,12 +676,9 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -729,24 +690,16 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24
-; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -758,18 +711,13 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -895,22 +843,20 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -964,22 +910,20 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -1161,15 +1105,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: s_clause 0x2
-; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23
-; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1211,15 +1155,15 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: s_clause 0x2
-; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23
-; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1929,18 +1873,18 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: s_clause 0x7
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
 ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1994,18 +1938,18 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: s_clause 0x7
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
 ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -3267,19 +3211,16 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
 ; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3334,19 +3275,16 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
 ; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3525,24 +3463,21 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3592,24 +3527,21 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -3783,25 +3715,20 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
 ; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
 ; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
 ; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3850,25 +3777,20 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
 ; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
 ; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
 ; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4037,24 +3959,21 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4104,24 +4023,21 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4302,34 +4218,31 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4398,34 +4311,31 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index f08ea27040fb5..01b7f40f6256f 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -471,7 +471,7 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -489,7 +489,7 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(3)
 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT: s_waitcnt lgkmcnt(3)
@@ -509,8 +509,8 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -526,7 +526,7 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,7 +544,7 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(3)
 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT: s_waitcnt lgkmcnt(3)
@@ -564,8 +564,8 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
 ; CHECK-NEXT: s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -2077,21 +2077,23 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
 ; CHECK-NEXT: s_waitcnt vmcnt(3)
 ; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -2143,21 +2145,23 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
 ; CHECK-NEXT: s_waitcnt vmcnt(3)
 ; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
 tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index 7eb44636f79d7..010642b75f5f7 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -47,7 +47,7 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1
 atomic:
 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 100
- %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y syncscope("workgroup") seq_cst
+ %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
 store i32 %ret, ptr addrspace(1) %out
 br label %exit
@@ -87,7 +87,7 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs
 atomic:
 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 100
- %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y syncscope("workgroup") seq_cst
+ %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
 br label %exit
@@ -96,3 +96,5 @@ exit:
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
new file mode 100644
index 0000000000000..6d0aa1e784530
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %val4, <16 x i64> %val16) {
+; CHECK-LABEL: no_folding_imm_to_inst_with_fi:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: s_load_b256 s[36:43], s[4:5], 0x24
+; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0xe4
+; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4
+; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base
+; CHECK-NEXT: s_movk_i32 s33, 0x70
+; CHECK-NEXT: s_movk_i32 s34, 0x60
+; CHECK-NEXT: s_or_b32 s44, 0x80, s33
+; CHECK-NEXT: s_mov_b32 s45, s35
+; CHECK-NEXT: s_or_b32 s46, 0x80, s34
+; CHECK-NEXT: s_mov_b32 s47, s35
+; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45
+; CHECK-NEXT: v_dual_mov_b32 v22, s46 :: v_dual_mov_b32 v23, s47
+; CHECK-NEXT: s_movk_i32 s34, 0x80
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v35, s35
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41
+; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37
+; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
+; CHECK-NEXT: s_movk_i32 s20, 0x50
+; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29
+; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_or_b32 s20, 0x80, s20
+; CHECK-NEXT: s_mov_b32 s21, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; CHECK-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[24:25], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; CHECK-NEXT: s_or_b32 s16, 0x80, 64
+; CHECK-NEXT: s_mov_b32 s17, s35
+; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; CHECK-NEXT: s_or_b32 s12, 0x80, 48
+; CHECK-NEXT: s_mov_b32 s13, s35
+; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; CHECK-NEXT: s_or_b32 s8, 0x80, 32
+; CHECK-NEXT: s_mov_b32 s9, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; CHECK-NEXT: s_or_b32 s4, 0x80, 16
+; CHECK-NEXT: s_mov_b32 s5, s35
+; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v26, s16
+; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12
+; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v30, s8
+; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v32, s4
+; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[30:31], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[32:33], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[34:35], v[16:19] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[30:31] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[34:35] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[32:33] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_endpgm
+bb:
+ %alloca = alloca <4 x i64>, align 32, addrspace(5)
+ %alloca1 = alloca <16 x i64>, align 128, addrspace(5)
+ store volatile <4 x i64> %val4, ptr addrspace(5) %alloca
+ %ascast = addrspacecast ptr addrspace(5) %alloca1 to ptr
+ store volatile <16 x i64> %val16, ptr %ascast
+ %load = load volatile <16 x i64>, ptr %ascast
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index fc360423599e3..89bcfb3b3a834 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -58,6 +58,6 @@ declare i64 @llvm.amdgcn.s.getpc() #0
 attributes #0 = { nounwind readnone speculatable willreturn }
 ;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll index ad42748ab3d60..c1123d7b515be 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll @@ -1,9 +1,41 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=16 < %s | FileCheck --check-prefix=BASE --check-prefix=MAX16 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24 -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE %s --check-prefix=DEFAULT define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { +; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements( +; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX16-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5) +; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false) +; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20 +; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; MAX16-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; MAX16-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; MAX16-NEXT: ret void +; +; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements( +; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX24-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX24-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison +; MAX24-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> , i32 [[SEL2]] +; MAX24-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; MAX24-NEXT: ret void +; ; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements( ; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; DEFAULT-NEXT: [[X:%.*]] = tail call i32 
@llvm.amdgcn.workitem.id.x() @@ -19,12 +51,50 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { ; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 ; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 ; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] -; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 -; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 ; DEFAULT-NEXT: ret void ; -; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements( -; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %c1 = icmp uge i32 %x, 3 + %c2 = icmp uge i32 %y, 3 + %sel1 = select i1 %c1, i32 1, i32 2 + %sel2 = select i1 %c2, i32 0, i32 %sel1 + %alloca = alloca [24 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false) + %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20 + store i32 42, ptr addrspace(5) %gep.0 + store i32 43, ptr addrspace(5) %gep.1 + %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %load = load i32, ptr addrspace(5) %gep + store i32 %load, ptr %out + ret void +} + +define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 { +; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX16-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5) +; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false) +; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20 +; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; MAX16-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; MAX16-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 +; MAX16-NEXT: ret void +; +; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { ; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() ; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 @@ -36,18 +106,18 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { ; MAX24-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 ; MAX24-NEXT: ret void ; -; MAX32-LABEL: define amdgpu_kernel void @i32_24_elements( -; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; 
MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison -; MAX32-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]] -; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 -; MAX32-NEXT: ret void +; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( +; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; DEFAULT-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison +; DEFAULT-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]] +; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; DEFAULT-NEXT: ret void ; %x = tail call i32 @llvm.amdgcn.workitem.id.x() %y = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -67,18 +137,24 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 { ret void } -define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 { -; BASE-LABEL: define amdgpu_kernel void @i32_24_elements_attrib( -; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +define amdgpu_kernel void @i32_32_elements(ptr %out) #0 { +; BASE-LABEL: define amdgpu_kernel void @i32_32_elements( +; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { ; BASE-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; BASE-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() ; BASE-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 ; BASE-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 ; BASE-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 ; BASE-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; BASE-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison -; BASE-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]] -; BASE-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; BASE-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) +; BASE-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) +; BASE-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; BASE-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 +; BASE-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; BASE-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; BASE-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; BASE-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; BASE-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 ; BASE-NEXT: ret void ; %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -87,40 +163,40 @@ define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 { %c2 = icmp uge i32 %y, 3 %sel1 = select i1 %c1, i32 1, i32 2 %sel2 = select i1 %c2, i32 0, i32 %sel1 - %alloca = alloca [24 x i32], align 16, addrspace(5) - call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false) - %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 - %gep.1 =
getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20 + %alloca = alloca [32 x i32], align 16, addrspace(5) + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false) + %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 + %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30 store i32 42, ptr addrspace(5) %gep.0 store i32 43, ptr addrspace(5) %gep.1 - %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 + %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 %load = load i32, ptr addrspace(5) %gep store i32 %load, ptr %out ret void } -define amdgpu_kernel void @i32_32_elements(ptr %out) #0 { -; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements( -; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { -; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; DEFAULT-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) -; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) -; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 -; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 -; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 -; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 -; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] -; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 -; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 -; DEFAULT-NEXT: ret void +define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 { +; MAX16-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( +; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { +; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; MAX16-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 +; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 +; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 +; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] +; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) +; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) +; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 +; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 +; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 +; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 +; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] +; MAX16-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 +; MAX16-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 +; MAX16-NEXT: ret void ; -; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements( -; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { +; MAX24-LABEL: define amdgpu_kernel 
void @i32_32_elements_attrib( +; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { ; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() ; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 @@ -138,38 +214,6 @@ define amdgpu_kernel void @i32_32_elements(ptr %out) #0 { ; MAX24-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 ; MAX24-NEXT: ret void ; -; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements( -; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] { -; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <32 x i32> poison -; MAX32-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]] -; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 -; MAX32-NEXT: ret void -; - %x = tail call i32 @llvm.amdgcn.workitem.id.x() - %y = tail call i32 @llvm.amdgcn.workitem.id.y() - %c1 = icmp uge i32 %x, 3 - %c2 = icmp uge i32 %y, 3 - %sel1 = select i1 %c1, i32 1, i32 2 - %sel2 = select i1 %c2, i32 0, i32 %sel1 - %alloca = alloca [32 x i32], align 16, addrspace(5) - call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false) - %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0 - %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30 - store i32 42, ptr addrspace(5) %gep.0 - store i32 43, ptr addrspace(5) %gep.1 - %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2 - %load = load i32, ptr addrspace(5) %gep - store i32 %load, ptr %out - ret void -} - -define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 { ; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( ; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { ; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -182,38 +226,6 @@ define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 { ; DEFAULT-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]] ; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 ; DEFAULT-NEXT: ret void -; -; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( -; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { -; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; MAX24-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; MAX24-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5) -; MAX24-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false) -; MAX24-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 -; MAX24-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30 -; MAX24-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4 -; MAX24-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4 -; MAX24-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]] -; MAX24-NEXT: 
[[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4 -; MAX24-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4 -; MAX24-NEXT: ret void -; -; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements_attrib( -; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] { -; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() -; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3 -; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3 -; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2 -; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]] -; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <32 x i32> poison -; MAX32-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]] -; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4 -; MAX32-NEXT: ret void ; %x = tail call i32 @llvm.amdgcn.workitem.id.x() %y = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -237,6 +249,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) -attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="16" } attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="24" } -attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="32" } +attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll index aabd5df956837..ec04c6aa7f10d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck --enable-var-scope %s +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-max-regs=16 < %s | FileCheck --enable-var-scope %s declare void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0 declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll index 13605a1f72305..606cd653084f6 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll @@ -204,11 +204,11 @@ attributes #7 = { "amdgpu-flat-work-group-size"="64,256" } ;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { 
"amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { 
"amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index b87d266cc2514..02c76473591de 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -279,7 +279,7 @@ define amdgpu_kernel void @kernel_3_6() #12 { ; 3,6 -> 6,9 define internal void @refine_upper_func_3_6() #13 { ; CHECK-LABEL: define internal void @refine_upper_func_3_6 -; CHECK-SAME: () #[[ATTR9]] { +; CHECK-SAME: () #[[ATTR14:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -288,7 +288,7 @@ define internal void @refine_upper_func_3_6() #13 { ; 4,8 -> 6,8 define internal void @refine_lower_func_4_8() #14 { ; CHECK-LABEL: define internal void @refine_lower_func_4_8 -; CHECK-SAME: () #[[ATTR14:[0-9]+]] { +; CHECK-SAME: () #[[ATTR15:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: ret void ; @@ -298,7 +298,7 @@ define internal void @refine_lower_func_4_8() #14 { define amdgpu_kernel void @kernel_foo_6_8() #15 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8 -; CHECK-SAME: () #[[ATTR15:[0-9]+]] { +; CHECK-SAME: () #[[ATTR16:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: call void @refine_lower_func_4_8() ; CHECK-NEXT: call void @func_9_10_a() @@ -313,7 +313,7 @@ define amdgpu_kernel void @kernel_foo_6_8() #15 { ; 5,5 -> 5,5 define internal void @func_5_5() #16 { ; CHECK-LABEL: define internal void @func_5_5 -; CHECK-SAME: () #[[ATTR16:[0-9]+]] { +; CHECK-SAME: () #[[ATTR17:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -322,7 +322,7 @@ define internal void @func_5_5() #16 { ; 5,8 -> 8,8 define internal void @func_5_8() #17 { ; CHECK-LABEL: define internal void @func_5_8 -; CHECK-SAME: () #[[ATTR17:[0-9]+]] { +; CHECK-SAME: () #[[ATTR18:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -331,7 +331,7 @@ define internal void @func_5_8() #17 { ; 9,10 -> 9,10 define internal void @func_9_10_a() #18 { ; CHECK-LABEL: define internal void @func_9_10_a -; CHECK-SAME: () #[[ATTR18:[0-9]+]] { +; CHECK-SAME: () #[[ATTR19:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -340,7 +340,7 @@ define internal void @func_9_10_a() #18 { ; 9,10 -> 9,9 define internal void @func_9_10_b() #18 { ; CHECK-LABEL: define internal void @func_9_10_b -; CHECK-SAME: () #[[ATTR18]] { +; CHECK-SAME: () #[[ATTR19]] { ; CHECK-NEXT: ret void ; ret void @@ -348,7 +348,7 @@ define internal void @func_9_10_b() #18 { define amdgpu_kernel void @kernel_bar_8_9() #19 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9 -; CHECK-SAME: () #[[ATTR19:[0-9]+]] { +; CHECK-SAME: () #[[ATTR20:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: call void @func_5_5() ; CHECK-NEXT: call void @func_9_10_b() @@ -408,15 +408,16 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll index 6e9e4e47ffa0f..2b07fc716e8b9 100644 --- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll @@ -19,5 +19,5 @@ define void @hoge() { ret void } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 78f8090eae9d4..7fc2088475cc9 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -192,12 +192,12 @@ define amdgpu_kernel void @kernel_lds_recursion() { !1 = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="4" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="4" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll new file mode 100644 index 0000000000000..93c846e606278 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll @@ -0,0 +1,204 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s + +define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) { +; GCN-LABEL: scratch_load_b32_alloca_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %p = alloca [64 x i32], align 4, addrspace(5) + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idx32(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i32 %idx + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_wrong_stride(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @scratch_load_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @scratch_load_b96_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @scratch_load_b96_idxpromi_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd + %ret = load <3 x float>, ptr 
addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @scratch_load_b128_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxadd + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b8_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr addrspace(5) %p, i64 %idxadd + %ld = load i8, ptr addrspace(5) %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @scratch_load_b64_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <2 x float> %ret +} + +; Multiplication is unsigned here, so we cannot match it. 
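+; Hedged gloss on the comment above (illustration only, nothing below checks
+; it): the preceding b96 tests promote the index with zext, which the scaled
+; addressing mode can absorb, while the two range tests that follow use a
+; sign-extended index that the unsigned index multiplication cannot fold:
+;   %idxprom = sext i32 %idx to i64 ; blocks the scaled-offset match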
+ +define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @scratch_load_b128_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + store float 1.0, ptr addrspace(5) %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) { +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom + store i16 1, ptr addrspace(5) %arrayidx, align 2 + ret void +} + +define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds double, ptr addrspace(5) %p, i64 %idxprom + store double 1.0, ptr addrspace(5) %arrayidx, align 4 + ret void +} + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 15eb41a1a5b65..df496258a2509 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -404,12 +404,11 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_not_b32_e32 v5, v10 +; GCN-IR-NEXT: v_not_b32_e32 v4, v10 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[6:7], v8 -; GCN-IR-NEXT: v_not_b32_e32 v4, 0 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v5, v11 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v4, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], -1, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll index 8ea83da78f889..159aede8d96ba 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll @@ -19,7 +19,7 @@ define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace( %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64 %shl = shl i64 %cast, 2 %castback = inttoptr i64 %shl to ptr addrspace(1) - %val = 
atomicrmw and ptr addrspace(1) %castback, i32 3 syncscope("agent") seq_cst + %val = atomicrmw and ptr addrspace(1) %castback, i32 3 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index 1bdb662afc648..98be7b480474a 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -70,8 +70,8 @@ entry: } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CW: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index efb9ff8fa04bc..dd65e9bb43954 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) -; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 +; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8, !noalias.addrspace [[META0:![0-9]+]] ; ATTRIBUTOR_GCN-NEXT: call void @indirect() ; ATTRIBUTOR_GCN-NEXT: ret void ; @@ -57,7 +57,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ;. 
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 6da7d1b7ee868..c4fcf14b284c3 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -4861,625 +4861,610 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: srem_v4i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[10:13], v8, s[10:11] offset:32 -; GCN-NEXT: global_load_dwordx4 v[14:17], v8, s[10:11] -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[10:11] offset:48 -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[10:11] offset:16 +; GCN-NEXT: global_load_dwordx4 v[10:13], v4, s[10:11] offset:48 +; GCN-NEXT: global_load_dwordx4 v[14:17], v4, s[10:11] offset:32 +; GCN-NEXT: global_load_dwordx4 v[6:9], v4, s[10:11] +; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_readfirstlane_b32 s5, v11 -; GCN-NEXT: v_readfirstlane_b32 s4, v10 +; GCN-NEXT: v_readfirstlane_b32 s11, v13 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_readfirstlane_b32 s7, v15 -; GCN-NEXT: v_readfirstlane_b32 s6, v14 -; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_cbranch_scc0 .LBB12_13 +; GCN-NEXT: v_readfirstlane_b32 s0, v15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_or_b32_e32 v5, s0, v7 +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN-NEXT: v_readfirstlane_b32 s10, v12 +; GCN-NEXT: v_readfirstlane_b32 s13, v11 +; GCN-NEXT: v_readfirstlane_b32 s12, v10 +; GCN-NEXT: v_readfirstlane_b32 s15, v17 +; GCN-NEXT: v_readfirstlane_b32 s14, v16 +; GCN-NEXT: v_readfirstlane_b32 s16, v14 +; GCN-NEXT: s_cbranch_vccz .LBB12_13 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_ashr_i32 s0, s5, 31 -; GCN-NEXT: s_add_u32 s2, s4, s0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s5, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[0:1] -; GCN-NEXT: v_cvt_f32_u32_e32 v8, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v9, s13 -; GCN-NEXT: s_sub_u32 s0, 0, s12 -; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 -; GCN-NEXT: v_rcp_f32_e32 v8, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GCN-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; GCN-NEXT: v_trunc_f32_e32 v9, v9 -; GCN-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_readfirstlane_b32 s2, v9 -; GCN-NEXT: v_readfirstlane_b32 s3, v8 -; GCN-NEXT: s_mul_i32 s5, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s15, s0, s3 -; GCN-NEXT: s_mul_i32 s14, s1, s3 -; GCN-NEXT: s_add_i32 s5, s15, s5 -; GCN-NEXT: s_add_i32 s5, s5, s14 -; GCN-NEXT: s_mul_i32 s16, s0, s3 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s5 -; GCN-NEXT: s_mul_i32 s15, s3, s5 -; GCN-NEXT: s_mul_hi_u32 s3, s3, s16 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s17, s2, s16 -; GCN-NEXT: s_mul_i32 s16, s2, s16 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s15, s2, s5 -; GCN-NEXT: s_addc_u32 s3, s14, s17 -; GCN-NEXT: s_addc_u32 s14, s15, 0 -; GCN-NEXT: s_mul_i32 s5, s2, s5 -; GCN-NEXT: s_add_u32 s3, s3, s5 -; GCN-NEXT: s_addc_u32 s5, 0, s14 -; GCN-NEXT: v_add_co_u32_e32 v8, vcc, s3, v8 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s5 -; GCN-NEXT: v_readfirstlane_b32 s5, v8 -; GCN-NEXT: s_mul_i32 s3, s0, s2 -; GCN-NEXT: s_mul_hi_u32 
s14, s0, s5 -; GCN-NEXT: s_add_i32 s3, s14, s3 -; GCN-NEXT: s_mul_i32 s1, s1, s5 -; GCN-NEXT: s_add_i32 s3, s3, s1 -; GCN-NEXT: s_mul_i32 s0, s0, s5 -; GCN-NEXT: s_mul_hi_u32 s14, s2, s0 -; GCN-NEXT: s_mul_i32 s15, s2, s0 -; GCN-NEXT: s_mul_i32 s17, s5, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s5, s0 -; GCN-NEXT: s_mul_hi_u32 s16, s5, s3 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s5, 0, s16 -; GCN-NEXT: s_add_u32 s0, s0, s15 -; GCN-NEXT: s_mul_hi_u32 s1, s2, s3 -; GCN-NEXT: s_addc_u32 s0, s5, s14 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mul_i32 s3, s2, s3 -; GCN-NEXT: s_add_u32 s0, s0, s3 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: v_add_co_u32_e32 v8, vcc, s0, v8 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s1 -; GCN-NEXT: s_ashr_i32 s14, s7, 31 -; GCN-NEXT: s_add_u32 s0, s6, s14 -; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s7, s14 -; GCN-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] -; GCN-NEXT: v_readfirstlane_b32 s3, v8 -; GCN-NEXT: s_mul_i32 s1, s16, s2 -; GCN-NEXT: s_mul_hi_u32 s5, s16, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s16, s2 -; GCN-NEXT: s_add_u32 s1, s5, s1 -; GCN-NEXT: s_addc_u32 s0, 0, s0 -; GCN-NEXT: s_mul_hi_u32 s7, s17, s3 -; GCN-NEXT: s_mul_i32 s3, s17, s3 -; GCN-NEXT: s_add_u32 s1, s1, s3 -; GCN-NEXT: s_mul_hi_u32 s5, s17, s2 -; GCN-NEXT: s_addc_u32 s0, s0, s7 -; GCN-NEXT: s_addc_u32 s1, s5, 0 -; GCN-NEXT: s_mul_i32 s2, s17, s2 -; GCN-NEXT: s_add_u32 s0, s0, s2 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_mul_hi_u32 s2, s12, s0 -; GCN-NEXT: s_add_i32 s1, s2, s1 -; GCN-NEXT: s_mul_i32 s2, s13, s0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_i32 s5, s1, s2 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: s_sub_i32 s1, s17, s5 -; GCN-NEXT: v_sub_co_u32_e32 v8, vcc, s16, v8 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_subb_u32 s7, s1, s13 -; GCN-NEXT: v_subrev_co_u32_e64 v9, s[0:1], s12, v8 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s15, s7, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s13 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v9 -; GCN-NEXT: s_cmp_eq_u32 s15, s13 -; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v11, s16 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[2:3] -; GCN-NEXT: s_subb_u32 s2, s7, s13 -; GCN-NEXT: v_subrev_co_u32_e64 v11, s[0:1], s12, v9 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s2, s2, 0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v10, s15 -; GCN-NEXT: v_mov_b32_e32 v11, s2 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GCN-NEXT: s_subb_u32 s0, s17, s5 -; GCN-NEXT: s_cmp_ge_u32 s0, s13 -; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v8 -; GCN-NEXT: s_cmp_eq_u32 s0, s13 -; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GCN-NEXT: v_mov_b32_e32 v14, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: v_mov_b32_e32 v14, s0 -; GCN-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v10, v14, v10, vcc -; GCN-NEXT: v_xor_b32_e32 v8, s14, v8 -; GCN-NEXT: v_xor_b32_e32 v9, s14, v10 -; GCN-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NEXT: v_subrev_co_u32_e32 v8, vcc, s14, v8 -; GCN-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v10, vcc +; GCN-NEXT: 
s_ashr_i32 s2, s0, 31 +; GCN-NEXT: s_add_u32 s4, s16, s2 +; GCN-NEXT: s_mov_b32 s3, s2 +; GCN-NEXT: s_addc_u32 s5, s0, s2 +; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GCN-NEXT: s_sub_u32 s0, 0, s4 +; GCN-NEXT: s_subb_u32 s1, 0, s5 +; GCN-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; GCN-NEXT: v_rcp_f32_e32 v4, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GCN-NEXT: v_trunc_f32_e32 v5, v5 +; GCN-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-NEXT: v_mul_lo_u32 v10, s0, v5 +; GCN-NEXT: v_mul_hi_u32 v11, s0, v4 +; GCN-NEXT: v_mul_lo_u32 v13, s1, v4 +; GCN-NEXT: v_mul_lo_u32 v12, s0, v4 +; GCN-NEXT: v_add_u32_e32 v10, v11, v10 +; GCN-NEXT: v_add_u32_e32 v10, v10, v13 +; GCN-NEXT: v_mul_hi_u32 v11, v4, v12 +; GCN-NEXT: v_mul_lo_u32 v13, v4, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v4, v10 +; GCN-NEXT: v_mul_lo_u32 v14, v5, v12 +; GCN-NEXT: v_mul_hi_u32 v12, v5, v12 +; GCN-NEXT: v_mul_hi_u32 v16, v5, v10 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v15, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v5, v10 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v16, vcc +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc +; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 +; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc +; GCN-NEXT: v_mul_lo_u32 v10, s0, v5 +; GCN-NEXT: v_mul_hi_u32 v11, s0, v4 +; GCN-NEXT: v_mul_lo_u32 v12, s1, v4 +; GCN-NEXT: v_mul_lo_u32 v13, s0, v4 +; GCN-NEXT: v_add_u32_e32 v10, v11, v10 +; GCN-NEXT: v_add_u32_e32 v10, v10, v12 +; GCN-NEXT: v_mul_lo_u32 v14, v4, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v4, v13 +; GCN-NEXT: v_mul_hi_u32 v16, v4, v10 +; GCN-NEXT: v_mul_hi_u32 v12, v5, v13 +; GCN-NEXT: v_mul_lo_u32 v13, v5, v13 +; GCN-NEXT: v_mul_hi_u32 v11, v5, v10 +; GCN-NEXT: v_add_co_u32_e32 v14, vcc, v15, v14 +; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v5, v10 +; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v14, v13 +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, v15, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v12, v10 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 +; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc +; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v7 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v6, v10 +; GCN-NEXT: v_xor_b32_e32 v11, v11, v10 +; GCN-NEXT: v_mul_lo_u32 v12, v11, v5 +; GCN-NEXT: v_mul_hi_u32 v13, v11, v4 +; GCN-NEXT: v_mul_hi_u32 v14, v11, v5 +; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v10, vcc +; GCN-NEXT: v_xor_b32_e32 v7, v7, v10 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc +; GCN-NEXT: v_mul_lo_u32 v14, v7, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v7, v4 +; GCN-NEXT: v_mul_hi_u32 v15, v7, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v7, v5 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14 +; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v4, vcc +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v15, vcc +; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v5, s4, v5 +; GCN-NEXT: v_mul_hi_u32 v12, s4, v4 +; GCN-NEXT: v_mul_lo_u32 v13, s5, v4 +; GCN-NEXT: 
v_mul_lo_u32 v4, s4, v4 +; GCN-NEXT: v_add_u32_e32 v5, v12, v5 +; GCN-NEXT: v_add_u32_e32 v5, v5, v13 +; GCN-NEXT: v_sub_u32_e32 v12, v7, v5 +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_sub_co_u32_e32 v4, vcc, v11, v4 +; GCN-NEXT: v_subb_co_u32_e64 v11, s[0:1], v12, v13, vcc +; GCN-NEXT: v_subrev_co_u32_e64 v12, s[0:1], s4, v4 +; GCN-NEXT: v_subbrev_co_u32_e64 v14, s[2:3], 0, v11, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v14 +; GCN-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v12 +; GCN-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v13, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v14 +; GCN-NEXT: v_subrev_co_u32_e64 v13, s[0:1], s4, v12 +; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v5, vcc +; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] +; GCN-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v11, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v5 +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v4 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s5, v5 +; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GCN-NEXT: v_cndmask_b32_e64 v11, v14, v11, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GCN-NEXT: v_xor_b32_e32 v4, v4, v10 +; GCN-NEXT: v_xor_b32_e32 v5, v5, v10 +; GCN-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v10 +; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc ; GCN-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-NEXT: .LBB12_2: -; GCN-NEXT: v_cvt_f32_u32_e32 v8, s4 -; GCN-NEXT: s_sub_i32 s0, 0, s4 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_readfirstlane_b32 s2, v8 -; GCN-NEXT: s_mul_i32 s0, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s0, s2, s0 -; GCN-NEXT: s_add_i32 s2, s2, s0 -; GCN-NEXT: s_mul_hi_u32 s0, s6, s2 -; GCN-NEXT: s_mul_i32 s0, s0, s4 -; GCN-NEXT: s_sub_i32 s0, s6, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s4 -; GCN-NEXT: s_cmp_ge_u32 s0, s4 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s4 -; GCN-NEXT: s_cmp_ge_u32 s0, s4 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s16 +; GCN-NEXT: s_sub_i32 s0, 0, s16 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-NEXT: v_mul_lo_u32 v5, s0, v4 +; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 +; GCN-NEXT: v_add_u32_e32 v4, v4, v5 +; GCN-NEXT: v_mul_hi_u32 v4, v6, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v4, s16 +; GCN-NEXT: v_sub_u32_e32 v4, v6, v4 +; GCN-NEXT: v_subrev_u32_e32 v5, s16, v4 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s16, v4 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_subrev_u32_e32 v5, s16, v4 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s16, v4 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: .LBB12_3: -; GCN-NEXT: v_or_b32_e32 v11, v17, v13 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_or_b32_e32 v7, s15, v9 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GCN-NEXT: s_cbranch_vccz .LBB12_14 ; GCN-NEXT: ; %bb.4: -; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v13 -; GCN-NEXT: 
v_add_co_u32_e32 v11, vcc, v12, v10 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v10, vcc +; GCN-NEXT: s_ashr_i32 s0, s15, 31 +; GCN-NEXT: s_add_u32 s2, s14, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_addc_u32 s3, s15, s0 +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] +; GCN-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v7, s5 +; GCN-NEXT: s_sub_u32 s0, 0, s4 +; GCN-NEXT: s_subb_u32 s1, 0, s5 +; GCN-NEXT: v_madmk_f32 v6, v7, 0x4f800000, v6 +; GCN-NEXT: v_rcp_f32_e32 v6, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GCN-NEXT: v_trunc_f32_e32 v7, v7 +; GCN-NEXT: v_madmk_f32 v6, v7, 0xcf800000, v6 +; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-NEXT: v_mul_lo_u32 v10, s0, v7 +; GCN-NEXT: v_mul_hi_u32 v11, s0, v6 +; GCN-NEXT: v_mul_lo_u32 v13, s1, v6 +; GCN-NEXT: v_mul_lo_u32 v12, s0, v6 +; GCN-NEXT: v_add_u32_e32 v10, v11, v10 +; GCN-NEXT: v_add_u32_e32 v10, v10, v13 +; GCN-NEXT: v_mul_hi_u32 v11, v6, v12 +; GCN-NEXT: v_mul_lo_u32 v13, v6, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v6, v10 +; GCN-NEXT: v_mul_lo_u32 v14, v7, v12 +; GCN-NEXT: v_mul_hi_u32 v12, v7, v12 +; GCN-NEXT: v_mul_hi_u32 v16, v7, v10 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v15, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v7, v10 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v16, vcc +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc +; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 +; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc +; GCN-NEXT: v_mul_lo_u32 v10, s0, v7 +; GCN-NEXT: v_mul_hi_u32 v11, s0, v6 +; GCN-NEXT: v_mul_lo_u32 v12, s1, v6 +; GCN-NEXT: v_mul_lo_u32 v13, s0, v6 +; GCN-NEXT: v_add_u32_e32 v10, v11, v10 +; GCN-NEXT: v_add_u32_e32 v10, v10, v12 +; GCN-NEXT: v_mul_lo_u32 v14, v6, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v6, v13 +; GCN-NEXT: v_mul_hi_u32 v16, v6, v10 +; GCN-NEXT: v_mul_hi_u32 v12, v7, v13 +; GCN-NEXT: v_mul_lo_u32 v13, v7, v13 +; GCN-NEXT: v_mul_hi_u32 v11, v7, v10 +; GCN-NEXT: v_add_co_u32_e32 v14, vcc, v15, v14 +; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v7, v10 +; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v14, v13 +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, v15, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v12, v10 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 +; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc +; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v8, v10 ; GCN-NEXT: v_xor_b32_e32 v11, v11, v10 -; GCN-NEXT: v_xor_b32_e32 v10, v13, v10 -; GCN-NEXT: v_cvt_f32_u32_e32 v13, v11 -; GCN-NEXT: v_cvt_f32_u32_e32 v14, v10 -; GCN-NEXT: v_sub_co_u32_e32 v15, vcc, 0, v11 -; GCN-NEXT: v_subb_co_u32_e32 v18, vcc, 0, v10, vcc -; GCN-NEXT: v_madmk_f32 v13, v14, 0x4f800000, v13 -; GCN-NEXT: v_rcp_f32_e32 v13, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 0x5f7ffffc, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 0x2f800000, v13 -; GCN-NEXT: v_trunc_f32_e32 v14, v14 -; GCN-NEXT: v_madmk_f32 v13, v14, 0xcf800000, v13 -; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GCN-NEXT: v_mul_lo_u32 v20, v15, v14 -; GCN-NEXT: v_mul_hi_u32 v19, v15, v13 -; GCN-NEXT: v_mul_lo_u32 v21, v18, v13 -; GCN-NEXT: v_mul_lo_u32 v22, v15, v13 -; 
GCN-NEXT: v_add_u32_e32 v19, v19, v20 -; GCN-NEXT: v_add_u32_e32 v19, v19, v21 -; GCN-NEXT: v_mul_lo_u32 v20, v13, v19 -; GCN-NEXT: v_mul_hi_u32 v21, v13, v22 -; GCN-NEXT: v_mul_hi_u32 v23, v13, v19 -; GCN-NEXT: v_mul_hi_u32 v24, v14, v19 -; GCN-NEXT: v_mul_lo_u32 v19, v14, v19 -; GCN-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20 -; GCN-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v23, vcc -; GCN-NEXT: v_mul_lo_u32 v23, v14, v22 -; GCN-NEXT: v_mul_hi_u32 v22, v14, v22 -; GCN-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23 -; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, v21, v22, vcc -; GCN-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v24, vcc -; GCN-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19 -; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v21, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v19 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v20, vcc -; GCN-NEXT: v_mul_lo_u32 v19, v15, v14 -; GCN-NEXT: v_mul_hi_u32 v20, v15, v13 -; GCN-NEXT: v_mul_lo_u32 v18, v18, v13 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v13 -; GCN-NEXT: v_add_u32_e32 v19, v20, v19 -; GCN-NEXT: v_add_u32_e32 v18, v19, v18 -; GCN-NEXT: v_mul_lo_u32 v21, v13, v18 -; GCN-NEXT: v_mul_hi_u32 v22, v13, v15 -; GCN-NEXT: v_mul_hi_u32 v23, v13, v18 -; GCN-NEXT: v_mul_hi_u32 v20, v14, v15 -; GCN-NEXT: v_mul_lo_u32 v15, v14, v15 -; GCN-NEXT: v_mul_hi_u32 v19, v14, v18 -; GCN-NEXT: v_add_co_u32_e32 v21, vcc, v22, v21 -; GCN-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v23, vcc -; GCN-NEXT: v_mul_lo_u32 v18, v14, v18 -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v21, v15 -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, v22, v20, vcc -; GCN-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v15, v18 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v19, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v15 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v18, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v17 -; GCN-NEXT: v_add_co_u32_e32 v18, vcc, v16, v15 -; GCN-NEXT: v_xor_b32_e32 v18, v18, v15 -; GCN-NEXT: v_mul_lo_u32 v19, v18, v14 -; GCN-NEXT: v_mul_hi_u32 v20, v18, v13 -; GCN-NEXT: v_mul_hi_u32 v21, v18, v14 -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v15, vcc -; GCN-NEXT: v_xor_b32_e32 v17, v17, v15 -; GCN-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19 -; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v21, vcc -; GCN-NEXT: v_mul_lo_u32 v21, v17, v13 -; GCN-NEXT: v_mul_hi_u32 v13, v17, v13 -; GCN-NEXT: v_mul_hi_u32 v22, v17, v14 -; GCN-NEXT: v_mul_lo_u32 v14, v17, v14 -; GCN-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v20, v13, vcc -; GCN-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v22, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v14 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v19, vcc -; GCN-NEXT: v_mul_lo_u32 v14, v11, v14 -; GCN-NEXT: v_mul_hi_u32 v19, v11, v13 -; GCN-NEXT: v_mul_lo_u32 v20, v10, v13 -; GCN-NEXT: v_mul_lo_u32 v13, v11, v13 -; GCN-NEXT: v_add_u32_e32 v14, v19, v14 -; GCN-NEXT: v_add_u32_e32 v14, v14, v20 -; GCN-NEXT: v_sub_u32_e32 v19, v17, v14 -; GCN-NEXT: v_sub_co_u32_e32 v13, vcc, v18, v13 -; GCN-NEXT: v_subb_co_u32_e64 v18, s[0:1], v19, v10, vcc -; GCN-NEXT: v_sub_co_u32_e64 v19, s[0:1], v13, v11 -; GCN-NEXT: v_subbrev_co_u32_e64 v20, s[2:3], 0, v18, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v10 -; GCN-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v11 -; GCN-NEXT: v_subb_co_u32_e32 v14, vcc, v17, v14, vcc -; GCN-NEXT: v_cndmask_b32_e64 v22, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], v20, v10 -; GCN-NEXT: v_subb_co_u32_e64 v18, s[0:1], v18, v10, s[0:1] -; 
GCN-NEXT: v_cmp_ge_u32_e32 vcc, v14, v10 -; GCN-NEXT: v_cndmask_b32_e64 v21, v21, v22, s[2:3] -; GCN-NEXT: v_sub_co_u32_e64 v22, s[0:1], v19, v11 -; GCN-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v13, v11 -; GCN-NEXT: v_subbrev_co_u32_e64 v18, s[0:1], 0, v18, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v14, v10 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v21 -; GCN-NEXT: v_cndmask_b32_e32 v10, v17, v11, vcc -; GCN-NEXT: v_cndmask_b32_e64 v19, v19, v22, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cndmask_b32_e64 v18, v20, v18, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc -; GCN-NEXT: v_cndmask_b32_e32 v10, v14, v18, vcc -; GCN-NEXT: v_xor_b32_e32 v11, v11, v15 -; GCN-NEXT: v_xor_b32_e32 v13, v10, v15 -; GCN-NEXT: v_sub_co_u32_e32 v10, vcc, v11, v15 -; GCN-NEXT: v_subb_co_u32_e32 v11, vcc, v13, v15, vcc +; GCN-NEXT: v_mul_lo_u32 v12, v11, v7 +; GCN-NEXT: v_mul_hi_u32 v13, v11, v6 +; GCN-NEXT: v_mul_hi_u32 v14, v11, v7 +; GCN-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v10, vcc +; GCN-NEXT: v_xor_b32_e32 v9, v9, v10 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc +; GCN-NEXT: v_mul_lo_u32 v14, v9, v6 +; GCN-NEXT: v_mul_hi_u32 v6, v9, v6 +; GCN-NEXT: v_mul_hi_u32 v15, v9, v7 +; GCN-NEXT: v_mul_lo_u32 v7, v9, v7 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14 +; GCN-NEXT: v_addc_co_u32_e32 v6, vcc, v13, v6, vcc +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v15, vcc +; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 +; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v7, s4, v7 +; GCN-NEXT: v_mul_hi_u32 v12, s4, v6 +; GCN-NEXT: v_mul_lo_u32 v13, s5, v6 +; GCN-NEXT: v_mul_lo_u32 v6, s4, v6 +; GCN-NEXT: v_add_u32_e32 v7, v12, v7 +; GCN-NEXT: v_add_u32_e32 v7, v7, v13 +; GCN-NEXT: v_sub_u32_e32 v12, v9, v7 +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_sub_co_u32_e32 v6, vcc, v11, v6 +; GCN-NEXT: v_subb_co_u32_e64 v11, s[0:1], v12, v13, vcc +; GCN-NEXT: v_subrev_co_u32_e64 v12, s[0:1], s4, v6 +; GCN-NEXT: v_subbrev_co_u32_e64 v14, s[2:3], 0, v11, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v14 +; GCN-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v12 +; GCN-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v13, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v14 +; GCN-NEXT: v_subrev_co_u32_e64 v13, s[0:1], s4, v12 +; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v7, vcc +; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] +; GCN-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v11, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v7 +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v6 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s5, v7 +; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GCN-NEXT: v_cndmask_b32_e64 v11, v14, v11, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GCN-NEXT: v_xor_b32_e32 v6, v6, v10 +; GCN-NEXT: v_xor_b32_e32 v7, v7, v10 +; GCN-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v10 +; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v10, vcc ; GCN-NEXT: s_cbranch_execnz .LBB12_6 ; GCN-NEXT: .LBB12_5: -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v12 -; GCN-NEXT: v_sub_u32_e32 v11, 0, v12 -; 
GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 -; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 -; GCN-NEXT: v_add_u32_e32 v10, v10, v11 -; GCN-NEXT: v_mul_hi_u32 v10, v16, v10 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v12 -; GCN-NEXT: v_sub_u32_e32 v10, v16, v10 -; GCN-NEXT: v_sub_u32_e32 v11, v10, v12 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 -; GCN-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GCN-NEXT: v_sub_u32_e32 v11, v10, v12 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 -; GCN-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, s14 +; GCN-NEXT: s_sub_i32 s0, 0, s14 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-NEXT: v_mul_lo_u32 v7, s0, v6 +; GCN-NEXT: v_mul_hi_u32 v7, v6, v7 +; GCN-NEXT: v_add_u32_e32 v6, v6, v7 +; GCN-NEXT: v_mul_hi_u32 v6, v8, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v6, s14 +; GCN-NEXT: v_sub_u32_e32 v6, v8, v6 +; GCN-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GCN-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: .LBB12_6: ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v5, v1 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GCN-NEXT: v_or_b32_e32 v9, s13, v1 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GCN-NEXT: s_cbranch_vccz .LBB12_15 ; GCN-NEXT: ; %bb.7: -; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v1 -; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v0, v13 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v13, vcc -; GCN-NEXT: v_xor_b32_e32 v12, v12, v13 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v13 -; GCN-NEXT: v_cvt_f32_u32_e32 v13, v12 -; GCN-NEXT: v_cvt_f32_u32_e32 v14, v1 -; GCN-NEXT: v_sub_co_u32_e32 v15, vcc, 0, v12 -; GCN-NEXT: v_subb_co_u32_e32 v16, vcc, 0, v1, vcc -; GCN-NEXT: v_madmk_f32 v13, v14, 0x4f800000, v13 -; GCN-NEXT: v_rcp_f32_e32 v13, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 0x5f7ffffc, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 0x2f800000, v13 -; GCN-NEXT: v_trunc_f32_e32 v14, v14 -; GCN-NEXT: v_madmk_f32 v13, v14, 0xcf800000, v13 -; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GCN-NEXT: v_mul_lo_u32 v18, v15, v14 -; GCN-NEXT: v_mul_hi_u32 v17, v15, v13 -; GCN-NEXT: v_mul_lo_u32 v19, v16, v13 -; GCN-NEXT: v_mul_lo_u32 v20, v15, v13 -; GCN-NEXT: v_add_u32_e32 v17, v17, v18 -; GCN-NEXT: v_add_u32_e32 v17, v17, v19 -; GCN-NEXT: v_mul_lo_u32 v18, v13, v17 -; GCN-NEXT: v_mul_hi_u32 v19, v13, v20 -; GCN-NEXT: v_mul_hi_u32 v21, v13, v17 -; GCN-NEXT: v_mul_hi_u32 v22, v14, v17 -; GCN-NEXT: v_mul_lo_u32 v17, v14, v17 -; GCN-NEXT: v_add_co_u32_e32 v18, vcc, v19, v18 -; GCN-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v21, vcc -; GCN-NEXT: v_mul_lo_u32 v21, v14, v20 -; GCN-NEXT: v_mul_hi_u32 v20, v14, v20 -; GCN-NEXT: v_add_co_u32_e32 v18, vcc, v18, v21 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, v19, v20, vcc -; GCN-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v22, vcc -; GCN-NEXT: v_add_co_u32_e32 v17, vcc, v18, v17 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v19, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v17 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v18, vcc -; GCN-NEXT: v_mul_lo_u32 v17, v15, v14 -; GCN-NEXT: v_mul_hi_u32 v18, v15, 
v13 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v13 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v13 -; GCN-NEXT: v_add_u32_e32 v17, v18, v17 -; GCN-NEXT: v_add_u32_e32 v16, v17, v16 -; GCN-NEXT: v_mul_lo_u32 v19, v13, v16 -; GCN-NEXT: v_mul_hi_u32 v20, v13, v15 -; GCN-NEXT: v_mul_hi_u32 v21, v13, v16 -; GCN-NEXT: v_mul_hi_u32 v18, v14, v15 -; GCN-NEXT: v_mul_lo_u32 v15, v14, v15 -; GCN-NEXT: v_mul_hi_u32 v17, v14, v16 -; GCN-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19 -; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v21, vcc -; GCN-NEXT: v_mul_lo_u32 v16, v14, v16 -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v19, v15 -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, v20, v18, vcc -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v15, v16 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v17, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v15 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v16, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v5 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v4, v15 -; GCN-NEXT: v_xor_b32_e32 v16, v16, v15 -; GCN-NEXT: v_mul_lo_u32 v17, v16, v14 -; GCN-NEXT: v_mul_hi_u32 v18, v16, v13 -; GCN-NEXT: v_mul_hi_u32 v19, v16, v14 -; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v15, vcc -; GCN-NEXT: v_xor_b32_e32 v5, v5, v15 -; GCN-NEXT: v_add_co_u32_e32 v17, vcc, v18, v17 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v19, vcc -; GCN-NEXT: v_mul_lo_u32 v19, v5, v13 -; GCN-NEXT: v_mul_hi_u32 v13, v5, v13 -; GCN-NEXT: v_mul_hi_u32 v20, v5, v14 -; GCN-NEXT: v_mul_lo_u32 v14, v5, v14 -; GCN-NEXT: v_add_co_u32_e32 v17, vcc, v17, v19 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v18, v13, vcc -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v20, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v14 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v17, vcc -; GCN-NEXT: v_mul_lo_u32 v14, v12, v14 -; GCN-NEXT: v_mul_hi_u32 v17, v12, v13 -; GCN-NEXT: v_mul_lo_u32 v18, v1, v13 -; GCN-NEXT: v_mul_lo_u32 v13, v12, v13 -; GCN-NEXT: v_add_u32_e32 v14, v17, v14 -; GCN-NEXT: v_add_u32_e32 v14, v14, v18 -; GCN-NEXT: v_sub_u32_e32 v17, v5, v14 -; GCN-NEXT: v_sub_co_u32_e32 v13, vcc, v16, v13 -; GCN-NEXT: v_subb_co_u32_e64 v16, s[0:1], v17, v1, vcc -; GCN-NEXT: v_sub_co_u32_e64 v17, s[0:1], v13, v12 -; GCN-NEXT: v_subbrev_co_u32_e64 v18, s[2:3], 0, v16, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v18, v1 -; GCN-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v17, v12 -; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v14, vcc -; GCN-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], v18, v1 -; GCN-NEXT: v_subb_co_u32_e64 v16, s[0:1], v16, v1, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 -; GCN-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[2:3] -; GCN-NEXT: v_sub_co_u32_e64 v20, s[0:1], v17, v12 -; GCN-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v13, v12 -; GCN-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v16, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v19 -; GCN-NEXT: v_cndmask_b32_e32 v1, v14, v12, vcc -; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v16, v18, v16, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GCN-NEXT: v_xor_b32_e32 v5, v5, v15 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v15 -; GCN-NEXT: v_sub_co_u32_e32 v12, vcc, v5, v15 -; GCN-NEXT: v_subb_co_u32_e32 v13, vcc, v1, v15, vcc +; GCN-NEXT: 
s_ashr_i32 s0, s13, 31 +; GCN-NEXT: s_add_u32 s2, s12, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_addc_u32 s3, s13, s0 +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] +; GCN-NEXT: v_cvt_f32_u32_e32 v8, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v9, s5 +; GCN-NEXT: s_sub_u32 s0, 0, s4 +; GCN-NEXT: s_subb_u32 s1, 0, s5 +; GCN-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 +; GCN-NEXT: v_rcp_f32_e32 v8, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GCN-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GCN-NEXT: v_trunc_f32_e32 v9, v9 +; GCN-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GCN-NEXT: v_mul_lo_u32 v10, s0, v9 +; GCN-NEXT: v_mul_hi_u32 v11, s0, v8 +; GCN-NEXT: v_mul_lo_u32 v13, s1, v8 +; GCN-NEXT: v_mul_lo_u32 v12, s0, v8 +; GCN-NEXT: v_add_u32_e32 v10, v11, v10 +; GCN-NEXT: v_add_u32_e32 v10, v10, v13 +; GCN-NEXT: v_mul_hi_u32 v11, v8, v12 +; GCN-NEXT: v_mul_lo_u32 v13, v8, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v8, v10 +; GCN-NEXT: v_mul_lo_u32 v14, v9, v12 +; GCN-NEXT: v_mul_hi_u32 v12, v9, v12 +; GCN-NEXT: v_mul_hi_u32 v16, v9, v10 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v15, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v9, v10 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v16, vcc +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc +; GCN-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GCN-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GCN-NEXT: v_mul_lo_u32 v10, s0, v9 +; GCN-NEXT: v_mul_hi_u32 v11, s0, v8 +; GCN-NEXT: v_mul_lo_u32 v12, s1, v8 +; GCN-NEXT: v_mul_lo_u32 v13, s0, v8 +; GCN-NEXT: v_add_u32_e32 v10, v11, v10 +; GCN-NEXT: v_add_u32_e32 v10, v10, v12 +; GCN-NEXT: v_mul_lo_u32 v14, v8, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v8, v13 +; GCN-NEXT: v_mul_hi_u32 v16, v8, v10 +; GCN-NEXT: v_mul_hi_u32 v12, v9, v13 +; GCN-NEXT: v_mul_lo_u32 v13, v9, v13 +; GCN-NEXT: v_mul_hi_u32 v11, v9, v10 +; GCN-NEXT: v_add_co_u32_e32 v14, vcc, v15, v14 +; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v9, v10 +; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v14, v13 +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, v15, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v12, v10 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GCN-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc +; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v0, v10 +; GCN-NEXT: v_xor_b32_e32 v11, v11, v10 +; GCN-NEXT: v_mul_lo_u32 v12, v11, v9 +; GCN-NEXT: v_mul_hi_u32 v13, v11, v8 +; GCN-NEXT: v_mul_hi_u32 v14, v11, v9 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v10, vcc +; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc +; GCN-NEXT: v_mul_lo_u32 v14, v1, v8 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v8 +; GCN-NEXT: v_mul_hi_u32 v15, v1, v9 +; GCN-NEXT: v_mul_lo_u32 v9, v1, v9 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14 +; GCN-NEXT: v_addc_co_u32_e32 v8, vcc, v13, v8, vcc +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v15, vcc +; GCN-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 +; GCN-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v9, s4, v9 +; GCN-NEXT: v_mul_hi_u32 v12, s4, v8 +; GCN-NEXT: v_mul_lo_u32 v13, s5, v8 +; GCN-NEXT: 
v_mul_lo_u32 v8, s4, v8 +; GCN-NEXT: v_add_u32_e32 v9, v12, v9 +; GCN-NEXT: v_add_u32_e32 v9, v9, v13 +; GCN-NEXT: v_sub_u32_e32 v12, v1, v9 +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_sub_co_u32_e32 v8, vcc, v11, v8 +; GCN-NEXT: v_subb_co_u32_e64 v11, s[0:1], v12, v13, vcc +; GCN-NEXT: v_subrev_co_u32_e64 v12, s[0:1], s4, v8 +; GCN-NEXT: v_subbrev_co_u32_e64 v14, s[2:3], 0, v11, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v14 +; GCN-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v12 +; GCN-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v13, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v14 +; GCN-NEXT: v_subrev_co_u32_e64 v13, s[0:1], s4, v12 +; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v9, vcc +; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] +; GCN-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v11, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v8 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 +; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GCN-NEXT: v_cndmask_b32_e64 v11, v14, v11, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GCN-NEXT: v_xor_b32_e32 v8, v8, v10 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 +; GCN-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v10 +; GCN-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v10, vcc ; GCN-NEXT: s_cbranch_execnz .LBB12_9 ; GCN-NEXT: .LBB12_8: -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: v_sub_u32_e32 v5, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s12 +; GCN-NEXT: s_sub_i32 s0, 0, s12 +; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v1 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_add_u32_e32 v1, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v1, v4, v1 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v0 -; GCN-NEXT: v_sub_u32_e32 v1, v4, v1 -; GCN-NEXT: v_sub_u32_e32 v4, v1, v0 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN-NEXT: v_sub_u32_e32 v4, v1, v0 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v12, v1, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v8, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v8 +; GCN-NEXT: v_add_u32_e32 v1, v1, v8 +; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: v_mul_lo_u32 v1, v1, s12 +; GCN-NEXT: v_sub_u32_e32 v0, v0, v1 +; GCN-NEXT: v_subrev_u32_e32 v1, s12, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_subrev_u32_e32 v1, s12, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc ; GCN-NEXT: .LBB12_9: -; GCN-NEXT: v_or_b32_e32 v1, v7, v3 +; GCN-NEXT: v_or_b32_e32 v1, s11, v3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN-NEXT: s_cbranch_vccz .LBB12_16 ; GCN-NEXT: ; %bb.10: -; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GCN-NEXT: v_add_co_u32_e32 v1, vcc, v2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_xor_b32_e32 v0, v3, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, v0 -; GCN-NEXT: 
v_sub_co_u32_e32 v5, vcc, 0, v1 -; GCN-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v0, vcc -; GCN-NEXT: v_madmk_f32 v3, v4, 0x4f800000, v3 -; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_madmk_f32 v3, v4, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_lo_u32 v16, v5, v4 -; GCN-NEXT: v_mul_hi_u32 v15, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v17, v14, v3 -; GCN-NEXT: v_mul_lo_u32 v18, v5, v3 -; GCN-NEXT: v_add_u32_e32 v15, v15, v16 -; GCN-NEXT: v_add_u32_e32 v15, v15, v17 -; GCN-NEXT: v_mul_lo_u32 v16, v3, v15 -; GCN-NEXT: v_mul_hi_u32 v17, v3, v18 -; GCN-NEXT: v_mul_hi_u32 v19, v3, v15 -; GCN-NEXT: v_mul_hi_u32 v20, v4, v15 -; GCN-NEXT: v_mul_lo_u32 v15, v4, v15 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v17, v16 -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v19, vcc -; GCN-NEXT: v_mul_lo_u32 v19, v4, v18 -; GCN-NEXT: v_mul_hi_u32 v18, v4, v18 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v16, v19 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, v17, v18, vcc -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v20, vcc -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v16, v15 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v17, vcc -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v3, v15 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v16, vcc -; GCN-NEXT: v_mul_lo_u32 v15, v5, v4 -; GCN-NEXT: v_mul_hi_u32 v16, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v3 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v3 -; GCN-NEXT: v_add_u32_e32 v15, v16, v15 -; GCN-NEXT: v_add_u32_e32 v14, v15, v14 -; GCN-NEXT: v_mul_lo_u32 v17, v3, v14 -; GCN-NEXT: v_mul_hi_u32 v18, v3, v5 -; GCN-NEXT: v_mul_hi_u32 v19, v3, v14 -; GCN-NEXT: v_mul_hi_u32 v16, v4, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v4, v5 -; GCN-NEXT: v_mul_hi_u32 v15, v4, v14 -; GCN-NEXT: v_add_co_u32_e32 v17, vcc, v18, v17 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v19, vcc -; GCN-NEXT: v_mul_lo_u32 v14, v4, v14 -; GCN-NEXT: v_add_co_u32_e32 v5, vcc, v17, v5 -; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, v18, v16, vcc -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_co_u32_e32 v5, vcc, v5, v14 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v15, vcc -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GCN-NEXT: v_add_co_u32_e32 v14, vcc, v6, v5 -; GCN-NEXT: v_xor_b32_e32 v14, v14, v5 -; GCN-NEXT: v_mul_lo_u32 v15, v14, v4 -; GCN-NEXT: v_mul_hi_u32 v16, v14, v3 -; GCN-NEXT: v_mul_hi_u32 v17, v14, v4 -; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v7, v7, v5 -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v16, v15 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v17, vcc -; GCN-NEXT: v_mul_lo_u32 v17, v7, v3 -; GCN-NEXT: v_mul_hi_u32 v3, v7, v3 -; GCN-NEXT: v_mul_hi_u32 v18, v7, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v7, v4 -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v15, v17 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v16, v3, vcc -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v18, vcc -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v15, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v15, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v16, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_u32_e32 v4, v15, v4 -; GCN-NEXT: v_add_u32_e32 v4, v4, v16 -; GCN-NEXT: v_sub_u32_e32 v15, v7, v4 -; GCN-NEXT: v_sub_co_u32_e32 v3, vcc, v14, v3 -; GCN-NEXT: v_subb_co_u32_e64 v14, s[0:1], v15, v0, 
vcc -; GCN-NEXT: v_sub_co_u32_e64 v15, s[0:1], v3, v1 -; GCN-NEXT: v_subbrev_co_u32_e64 v16, s[2:3], 0, v14, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v16, v0 -; GCN-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v15, v1 -; GCN-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], v16, v0 -; GCN-NEXT: v_subb_co_u32_e64 v14, s[0:1], v14, v0, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 -; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[2:3] -; GCN-NEXT: v_sub_co_u32_e64 v18, s[0:1], v15, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 -; GCN-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v14, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 -; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v18, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v14, v16, v14, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v14, vcc -; GCN-NEXT: v_xor_b32_e32 v1, v1, v5 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 -; GCN-NEXT: v_sub_co_u32_e32 v14, vcc, v1, v5 -; GCN-NEXT: v_subb_co_u32_e32 v15, vcc, v0, v5, vcc +; GCN-NEXT: s_ashr_i32 s0, s11, 31 +; GCN-NEXT: s_add_u32 s2, s10, s0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_addc_u32 s3, s11, s0 +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: s_sub_u32 s0, 0, s4 +; GCN-NEXT: s_subb_u32 s1, 0, s5 +; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_mul_lo_u32 v10, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v11, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v13, s1, v0 +; GCN-NEXT: v_mul_lo_u32 v12, s0, v0 +; GCN-NEXT: v_add_u32_e32 v10, v11, v10 +; GCN-NEXT: v_add_u32_e32 v10, v10, v13 +; GCN-NEXT: v_mul_hi_u32 v11, v0, v12 +; GCN-NEXT: v_mul_lo_u32 v13, v0, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v0, v10 +; GCN-NEXT: v_mul_lo_u32 v14, v1, v12 +; GCN-NEXT: v_mul_hi_u32 v12, v1, v12 +; GCN-NEXT: v_mul_hi_u32 v16, v1, v10 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v15, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v1, v10 +; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v16, vcc +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v10 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v11, vcc +; GCN-NEXT: v_mul_lo_u32 v10, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v11, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v12, s1, v0 +; GCN-NEXT: v_mul_lo_u32 v13, s0, v0 +; GCN-NEXT: v_add_u32_e32 v10, v11, v10 +; GCN-NEXT: v_add_u32_e32 v10, v10, v12 +; GCN-NEXT: v_mul_lo_u32 v14, v0, v10 +; GCN-NEXT: v_mul_hi_u32 v15, v0, v13 +; GCN-NEXT: v_mul_hi_u32 v16, v0, v10 +; GCN-NEXT: v_mul_hi_u32 v12, v1, v13 +; GCN-NEXT: v_mul_lo_u32 v13, v1, v13 +; GCN-NEXT: v_mul_hi_u32 v11, v1, v10 +; GCN-NEXT: v_add_co_u32_e32 v14, vcc, v15, v14 +; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, 
vcc +; GCN-NEXT: v_mul_lo_u32 v10, v1, v10 +; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v14, v13 +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, v15, v12, vcc +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v12, v10 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v10 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v11, vcc +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, v2, v11 +; GCN-NEXT: v_xor_b32_e32 v10, v10, v11 +; GCN-NEXT: v_mul_lo_u32 v12, v10, v1 +; GCN-NEXT: v_mul_hi_u32 v13, v10, v0 +; GCN-NEXT: v_mul_hi_u32 v14, v10, v1 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v11, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v3, v11 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc +; GCN-NEXT: v_mul_lo_u32 v14, v3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, v3, v0 +; GCN-NEXT: v_mul_hi_u32 v15, v3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, v3, v1 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14 +; GCN-NEXT: v_addc_co_u32_e32 v0, vcc, v13, v0, vcc +; GCN-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v15, vcc +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v1, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v12, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v13, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v0, s4, v0 +; GCN-NEXT: v_add_u32_e32 v1, v12, v1 +; GCN-NEXT: v_add_u32_e32 v1, v1, v13 +; GCN-NEXT: v_sub_u32_e32 v12, v3, v1 +; GCN-NEXT: v_mov_b32_e32 v13, s5 +; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, v10, v0 +; GCN-NEXT: v_subb_co_u32_e64 v10, s[0:1], v12, v13, vcc +; GCN-NEXT: v_subrev_co_u32_e64 v12, s[0:1], s4, v0 +; GCN-NEXT: v_subbrev_co_u32_e64 v14, s[2:3], 0, v10, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v14 +; GCN-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v12 +; GCN-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v13, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v14 +; GCN-NEXT: v_subrev_co_u32_e64 v13, s[0:1], s4, v12 +; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] +; GCN-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v10, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s5, v1 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e64 v10, v14, v10, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GCN-NEXT: v_xor_b32_e32 v0, v0, v11 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v11 +; GCN-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v11 +; GCN-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v11, vcc ; GCN-NEXT: s_cbranch_execnz .LBB12_12 ; GCN-NEXT: .LBB12_11: -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v2 -; GCN-NEXT: v_sub_u32_e32 v1, 0, v2 -; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GCN-NEXT: s_sub_i32 s0, 0, s10 +; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: 
v_add_u32_e32 v0, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, v6, v0 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v2 -; GCN-NEXT: v_sub_u32_e32 v0, v6, v0 -; GCN-NEXT: v_sub_u32_e32 v1, v0, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v0, v2, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s10 +; GCN-NEXT: v_sub_u32_e32 v0, v2, v0 +; GCN-NEXT: v_subrev_u32_e32 v1, s10, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_sub_u32_e32 v1, v0, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc +; GCN-NEXT: v_subrev_u32_e32 v1, s10, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 +; GCN-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc ; GCN-NEXT: .LBB12_12: ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[8:9] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB12_13: -; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NEXT: s_branch .LBB12_2 ; GCN-NEXT: .LBB12_14: ; GCN-NEXT: s_branch .LBB12_5 ; GCN-NEXT: .LBB12_15: -; GCN-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GCN-NEXT: s_branch .LBB12_8 ; GCN-NEXT: .LBB12_16: ; GCN-NEXT: s_branch .LBB12_11 @@ -6089,567 +6074,596 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-LABEL: srem_v4i64: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 -; TONGA-NEXT: v_mov_b32_e32 v0, s6 -; TONGA-NEXT: s_addc_u32 s1, s7, 0 -; TONGA-NEXT: v_mov_b32_e32 v1, s7 -; TONGA-NEXT: s_add_u32 s2, s6, 32 -; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] -; TONGA-NEXT: s_addc_u32 s3, s7, 0 -; TONGA-NEXT: v_mov_b32_e32 v0, s2 -; TONGA-NEXT: v_mov_b32_e32 v1, s3 -; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] -; TONGA-NEXT: v_mov_b32_e32 v0, s0 -; TONGA-NEXT: v_mov_b32_e32 v1, s1 -; TONGA-NEXT: s_add_u32 s0, s6, 16 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v5, s1 ; TONGA-NEXT: v_mov_b32_e32 v4, s0 +; TONGA-NEXT: s_add_u32 s0, s6, 32 +; TONGA-NEXT: s_addc_u32 s1, s7, 0 +; TONGA-NEXT: v_mov_b32_e32 v0, s6 +; TONGA-NEXT: v_mov_b32_e32 v9, s1 +; TONGA-NEXT: v_mov_b32_e32 v1, s7 +; TONGA-NEXT: v_mov_b32_e32 v8, s0 +; TONGA-NEXT: s_add_u32 s0, s6, 16 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; TONGA-NEXT: s_waitcnt vmcnt(2) -; TONGA-NEXT: v_or_b32_e32 v9, v15, v11 -; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; TONGA-NEXT: s_cbranch_vccz .LBB12_13 -; TONGA-NEXT: ; %bb.1: -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v11 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v8 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v11, v8, vcc -; TONGA-NEXT: v_xor_b32_e32 v9, v9, v8 -; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8 -; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v9 -; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v8 -; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v9 -; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v8, vcc -; TONGA-NEXT: v_madmk_f32 v11, v18, 0x4f800000, v11 -; TONGA-NEXT: v_rcp_f32_e32 v11, v11 -; TONGA-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; TONGA-NEXT: v_mul_f32_e32 v18, 0x2f800000, v11 -; TONGA-NEXT: v_trunc_f32_e32 v18, v18 -; TONGA-NEXT: v_madmk_f32 v11, v18, 0xcf800000, v11 -; TONGA-NEXT: v_cvt_u32_f32_e32 v22, v18 -; 
TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 -; TONGA-NEXT: v_mul_lo_u32 v20, v23, v22 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v21, v24, v11 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v20 -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v19, v21 -; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0 -; TONGA-NEXT: v_mul_hi_u32 v25, v11, v18 -; TONGA-NEXT: v_add_u32_e32 v25, vcc, v25, v19 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v22, v18, 0 -; TONGA-NEXT: v_addc_u32_e32 v26, vcc, 0, v20, vcc -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v22, v21, 0 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v18 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v26, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v21, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v20 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v22, v19, vcc -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v22, v23, v25 -; TONGA-NEXT: v_mul_lo_u32 v23, v24, v11 -; TONGA-NEXT: v_mul_hi_u32 v24, v11, v18 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v18, 0 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v22, v19 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v23 -; TONGA-NEXT: v_mad_u64_u32 v[22:23], s[0:1], v11, v19, 0 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v19, 0 -; TONGA-NEXT: v_add_u32_e32 v22, vcc, v24, v22 -; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; TONGA-NEXT: v_add_u32_e32 v20, vcc, v22, v20 -; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v23, v21, vcc -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v20, v18 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 -; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v25, v19, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v22 -; TONGA-NEXT: v_xor_b32_e32 v23, v18, v22 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v20, 0 -; TONGA-NEXT: v_mul_hi_u32 v21, v23, v11 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v22, vcc -; TONGA-NEXT: v_xor_b32_e32 v15, v15, v22 -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v18 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v19, vcc -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v11, 0 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v15, v20, 0 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v24, v18 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v25, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v20 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; TONGA-NEXT: v_mul_lo_u32 v20, v9, v18 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v11, v8, v11 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v20, v19 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v19 -; TONGA-NEXT: v_sub_u32_e32 v19, vcc, v15, v11 -; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v23, v18 -; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, vcc -; TONGA-NEXT: v_sub_u32_e64 v20, s[0:1], v18, v9 -; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v19, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v23, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v9 -; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v15, v11, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v24, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v8 -; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, s[0:1] -; TONGA-NEXT: 
v_cmp_ge_u32_e32 vcc, v11, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v20, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v18, v9 -; TONGA-NEXT: v_subbrev_u32_e64 v19, s[0:1], 0, v19, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23 -; TONGA-NEXT: v_cndmask_b32_e32 v8, v15, v9, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v24, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e32 v9, v18, v20, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v8, v11, v19, vcc -; TONGA-NEXT: v_xor_b32_e32 v9, v9, v22 -; TONGA-NEXT: v_xor_b32_e32 v11, v8, v22 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v9, v22 -; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v11, v22, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_3 -; TONGA-NEXT: .LBB12_2: -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v10 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_lo_u32 v9, v9, v8 -; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 -; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 -; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 -; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; TONGA-NEXT: v_mov_b32_e32 v9, 0 -; TONGA-NEXT: .LBB12_3: -; TONGA-NEXT: v_or_b32_e32 v11, v17, v13 -; TONGA-NEXT: v_mov_b32_e32 v10, 0 -; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; TONGA-NEXT: s_cbranch_vccz .LBB12_14 -; TONGA-NEXT: ; %bb.4: -; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v13 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v12, v10 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v13, v10, vcc -; TONGA-NEXT: v_xor_b32_e32 v15, v11, v10 -; TONGA-NEXT: v_xor_b32_e32 v20, v13, v10 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v15 -; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v20 -; TONGA-NEXT: v_sub_u32_e32 v21, vcc, 0, v15 -; TONGA-NEXT: v_subb_u32_e32 v22, vcc, 0, v20, vcc -; TONGA-NEXT: v_madmk_f32 v10, v11, 0x4f800000, v10 -; TONGA-NEXT: v_rcp_f32_e32 v10, v10 -; TONGA-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; TONGA-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 -; TONGA-NEXT: v_trunc_f32_e32 v11, v11 -; TONGA-NEXT: v_madmk_f32 v10, v11, 0xcf800000, v10 -; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v11 -; TONGA-NEXT: v_cvt_u32_f32_e32 v19, v10 -; TONGA-NEXT: v_mul_lo_u32 v13, v21, v18 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v21, v19, 0 -; TONGA-NEXT: v_mul_lo_u32 v14, v22, v19 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v13 -; TONGA-NEXT: v_add_u32_e32 v23, vcc, v11, v14 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v19, v23, 0 -; TONGA-NEXT: v_mul_hi_u32 v11, v19, v10 -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v11, v13 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v18, v10, 0 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v14, vcc -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v18, v23, 0 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v24, v10 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v25, v11, vcc -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v13 -; TONGA-NEXT: 
v_addc_u32_e32 v11, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v23, vcc, v19, v10 -; TONGA-NEXT: v_addc_u32_e32 v24, vcc, v18, v11, vcc -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v21, v23, 0 -; TONGA-NEXT: v_mul_lo_u32 v18, v21, v24 -; TONGA-NEXT: v_mul_lo_u32 v19, v22, v23 -; TONGA-NEXT: v_mul_hi_u32 v21, v23, v10 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v24, v10, 0 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v18, v11 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v19 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v24, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v21, v18 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v18, v13 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v19, v14, vcc -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v13, v10 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v23, v10 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, v24, v11, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v16, v18 -; TONGA-NEXT: v_xor_b32_e32 v19, v10, v18 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v19, v14, 0 -; TONGA-NEXT: v_mul_hi_u32 v21, v19, v13 -; TONGA-NEXT: v_addc_u32_e32 v17, vcc, v17, v18, vcc -; TONGA-NEXT: v_xor_b32_e32 v17, v17, v18 -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v21, v10 -; TONGA-NEXT: v_addc_u32_e32 v22, vcc, 0, v11, vcc -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v13, 0 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v17, v14, 0 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v21, v10 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v22, v11, vcc -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v10, v13 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; TONGA-NEXT: v_mul_lo_u32 v14, v15, v10 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v15, v13, 0 -; TONGA-NEXT: v_mul_lo_u32 v13, v20, v13 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v14, v11 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v13, v11 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v17, v11 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v19, v10 -; TONGA-NEXT: v_subb_u32_e64 v13, s[0:1], v13, v20, vcc -; TONGA-NEXT: v_sub_u32_e64 v14, s[0:1], v10, v15 -; TONGA-NEXT: v_subbrev_u32_e64 v19, s[2:3], 0, v13, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v20 -; TONGA-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v15 -; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v17, v11, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v22, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v19, v20 -; TONGA-NEXT: v_subb_u32_e64 v13, s[0:1], v13, v20, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v20 -; TONGA-NEXT: v_cndmask_b32_e64 v21, v21, v22, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v22, s[0:1], v14, v15 -; TONGA-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v10, v15 -; TONGA-NEXT: v_subbrev_u32_e64 v13, s[0:1], 0, v13, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v20 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v21 -; TONGA-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v22, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; TONGA-NEXT: v_cndmask_b32_e64 v13, v19, v13, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc -; TONGA-NEXT: v_xor_b32_e32 v10, v10, v18 -; TONGA-NEXT: 
v_xor_b32_e32 v11, v11, v18 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v10, v18 -; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_6 -; TONGA-NEXT: .LBB12_5: -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v12 -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v12 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 -; TONGA-NEXT: v_mul_lo_u32 v11, v11, v10 -; TONGA-NEXT: v_mul_hi_u32 v11, v10, v11 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v11 -; TONGA-NEXT: v_mul_hi_u32 v10, v16, v10 -; TONGA-NEXT: v_mul_lo_u32 v10, v10, v12 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v16, v10 -; TONGA-NEXT: v_subrev_u32_e32 v11, vcc, v12, v10 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 -; TONGA-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; TONGA-NEXT: v_subrev_u32_e32 v11, vcc, v12, v10 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 -; TONGA-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; TONGA-NEXT: v_mov_b32_e32 v11, 0 -; TONGA-NEXT: .LBB12_6: +; TONGA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; TONGA-NEXT: s_addc_u32 s1, s7, 0 +; TONGA-NEXT: v_mov_b32_e32 v13, s1 +; TONGA-NEXT: v_mov_b32_e32 v12, s0 +; TONGA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; TONGA-NEXT: s_waitcnt vmcnt(3) +; TONGA-NEXT: v_readfirstlane_b32 s19, v1 +; TONGA-NEXT: v_readfirstlane_b32 s18, v0 +; TONGA-NEXT: s_waitcnt vmcnt(1) +; TONGA-NEXT: v_readfirstlane_b32 s21, v9 +; TONGA-NEXT: v_readfirstlane_b32 s20, v8 +; TONGA-NEXT: s_or_b64 s[0:1], s[18:19], s[20:21] +; TONGA-NEXT: s_mov_b32 s0, 0 +; TONGA-NEXT: v_readfirstlane_b32 s15, v3 +; TONGA-NEXT: v_readfirstlane_b32 s14, v2 +; TONGA-NEXT: v_readfirstlane_b32 s7, v7 +; TONGA-NEXT: v_readfirstlane_b32 s6, v6 +; TONGA-NEXT: v_readfirstlane_b32 s11, v5 +; TONGA-NEXT: v_readfirstlane_b32 s10, v4 +; TONGA-NEXT: v_readfirstlane_b32 s17, v11 +; TONGA-NEXT: v_readfirstlane_b32 s16, v10 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_or_b32_e32 v13, v5, v1 -; TONGA-NEXT: v_mov_b32_e32 v12, 0 -; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; TONGA-NEXT: s_cbranch_vccz .LBB12_15 -; TONGA-NEXT: ; %bb.7: -; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v0, v12 -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc -; TONGA-NEXT: v_xor_b32_e32 v18, v13, v12 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v12 -; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v18 -; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v1 -; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v18 -; TONGA-NEXT: v_subb_u32_e32 v20, vcc, 0, v1, vcc -; TONGA-NEXT: v_madmk_f32 v12, v13, 0x4f800000, v12 -; TONGA-NEXT: v_rcp_f32_e32 v12, v12 -; TONGA-NEXT: v_mul_f32_e32 v12, 0x5f7ffffc, v12 -; TONGA-NEXT: v_mul_f32_e32 v13, 0x2f800000, v12 -; TONGA-NEXT: v_trunc_f32_e32 v13, v13 -; TONGA-NEXT: v_madmk_f32 v12, v13, 0xcf800000, v12 -; TONGA-NEXT: v_cvt_u32_f32_e32 v16, v13 -; TONGA-NEXT: v_cvt_u32_f32_e32 v17, v12 -; TONGA-NEXT: v_mul_lo_u32 v14, v19, v16 -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v19, v17, 0 -; TONGA-NEXT: v_mul_lo_u32 v15, v20, v17 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v14 -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v13, v15 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v17, v15, 0 -; TONGA-NEXT: v_mul_hi_u32 v21, v17, v12 -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v21, v13 -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v16, v12, 0 -; TONGA-NEXT: v_addc_u32_e32 v22, vcc, 0, v14, vcc -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v16, v15, 0 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v21, v12 -; TONGA-NEXT: v_addc_u32_e32 
v12, vcc, v22, v13, vcc -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v12, v14 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v17, v12 -; TONGA-NEXT: v_addc_u32_e32 v22, vcc, v16, v13, vcc -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v19, v21, 0 -; TONGA-NEXT: v_mul_lo_u32 v16, v19, v22 -; TONGA-NEXT: v_mul_lo_u32 v17, v20, v21 -; TONGA-NEXT: v_mul_hi_u32 v19, v21, v12 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v22, v12, 0 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v16, v13 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v17 -; TONGA-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v21, v13, 0 -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v22, v13, 0 -; TONGA-NEXT: v_add_u32_e32 v16, vcc, v19, v16 -; TONGA-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v16, v14 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, v17, v15, vcc -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v14, v12 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v21, v12 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v22, v13, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v16, 31, v5 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v4, v16 -; TONGA-NEXT: v_xor_b32_e32 v17, v12, v16 -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v17, v15, 0 -; TONGA-NEXT: v_mul_hi_u32 v19, v17, v14 -; TONGA-NEXT: v_addc_u32_e32 v5, vcc, v5, v16, vcc -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v16 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v12 -; TONGA-NEXT: v_addc_u32_e32 v20, vcc, 0, v13, vcc -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v5, v14, 0 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v5, v15, 0 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v19, v12 -; TONGA-NEXT: v_addc_u32_e32 v12, vcc, v20, v13, vcc -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v12, v14 -; TONGA-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; TONGA-NEXT: v_mul_lo_u32 v15, v18, v12 -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v18, v14, 0 -; TONGA-NEXT: v_mul_lo_u32 v14, v1, v14 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v15, v13 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v14, v13 -; TONGA-NEXT: v_sub_u32_e32 v14, vcc, v5, v13 -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v17, v12 -; TONGA-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v1, vcc -; TONGA-NEXT: v_sub_u32_e64 v15, s[0:1], v12, v18 -; TONGA-NEXT: v_subbrev_u32_e64 v17, s[2:3], 0, v14, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v17, v1 -; TONGA-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v15, v18 -; TONGA-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v17, v1 -; TONGA-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v1, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v20, s[0:1], v15, v18 -; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc -; TONGA-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v14, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v19 -; TONGA-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v12, v18 -; TONGA-NEXT: v_cndmask_b32_e64 v14, v17, v14, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v13, v17, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v15, v15, v20, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v14, vcc -; 
TONGA-NEXT: v_cndmask_b32_e32 v5, v12, v15, vcc -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v16 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v16 -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v5, v16 -; TONGA-NEXT: v_subb_u32_e32 v13, vcc, v1, v16, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_9 -; TONGA-NEXT: .LBB12_8: -; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v0 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 -; TONGA-NEXT: v_mov_b32_e32 v13, 0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 -; TONGA-NEXT: v_mul_lo_u32 v5, v5, v1 -; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; TONGA-NEXT: v_mul_hi_u32 v1, v4, v1 -; TONGA-NEXT: v_mul_lo_u32 v1, v1, v0 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 -; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v0, v1 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v0, v1 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; TONGA-NEXT: v_cndmask_b32_e32 v12, v1, v4, vcc -; TONGA-NEXT: .LBB12_9: -; TONGA-NEXT: v_or_b32_e32 v1, v7, v3 -; TONGA-NEXT: v_mov_b32_e32 v0, 0 -; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; TONGA-NEXT: s_cbranch_vccz .LBB12_16 -; TONGA-NEXT: ; %bb.10: -; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v0 -; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc -; TONGA-NEXT: v_xor_b32_e32 v5, v1, v0 -; TONGA-NEXT: v_xor_b32_e32 v16, v3, v0 -; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 -; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v16 -; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v5 -; TONGA-NEXT: v_subb_u32_e32 v18, vcc, 0, v16, vcc +; TONGA-NEXT: v_readfirstlane_b32 s9, v15 +; TONGA-NEXT: v_readfirstlane_b32 s8, v14 +; TONGA-NEXT: v_readfirstlane_b32 s13, v13 +; TONGA-NEXT: s_cmp_lg_u64 s[0:1], 0 +; TONGA-NEXT: v_readfirstlane_b32 s12, v12 +; TONGA-NEXT: s_cbranch_scc0 .LBB12_13 +; TONGA-NEXT: ; %bb.1: +; TONGA-NEXT: s_ashr_i32 s0, s21, 31 +; TONGA-NEXT: s_add_u32 s2, s20, s0 +; TONGA-NEXT: s_mov_b32 s1, s0 +; TONGA-NEXT: s_addc_u32 s3, s21, s0 +; TONGA-NEXT: s_xor_b64 s[22:23], s[2:3], s[0:1] +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s22 +; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s23 +; TONGA-NEXT: s_sub_u32 s2, 0, s22 +; TONGA-NEXT: s_subb_u32 s3, 0, s23 +; TONGA-NEXT: s_ashr_i32 s26, s19, 31 ; TONGA-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; TONGA-NEXT: v_rcp_f32_e32 v0, v0 +; TONGA-NEXT: s_mov_b32 s27, s26 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; TONGA-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; TONGA-NEXT: v_trunc_f32_e32 v1, v1 ; TONGA-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v1 -; TONGA-NEXT: v_cvt_u32_f32_e32 v15, v0 -; TONGA-NEXT: v_mul_lo_u32 v3, v17, v14 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v17, v15, 0 -; TONGA-NEXT: v_mul_lo_u32 v4, v18, v15 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v1, v4 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v15, v19, 0 -; TONGA-NEXT: v_mul_hi_u32 v1, v15, v0 -; TONGA-NEXT: v_add_u32_e32 v20, vcc, v1, v3 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v14, v0, 0 -; TONGA-NEXT: v_addc_u32_e32 v21, vcc, 0, v4, vcc -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v14, v19, 0 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v20, v0 -; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v21, v1, vcc -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v0 +; TONGA-NEXT: 
v_mul_lo_u32 v2, s2, v4 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v5, 0 +; TONGA-NEXT: v_mul_lo_u32 v3, s3, v5 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; TONGA-NEXT: v_mul_hi_u32 v6, v5, v0 +; TONGA-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v0, 0 +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v3, 0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v7, v1, vcc +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v15, v0 -; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v14, v1, vcc -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v17, v19, 0 -; TONGA-NEXT: v_mul_lo_u32 v14, v17, v20 -; TONGA-NEXT: v_mul_lo_u32 v15, v18, v19 -; TONGA-NEXT: v_mul_hi_u32 v17, v19, v0 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v20, v0, 0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v14, v1 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v15 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v19, v1, 0 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v20, v1, 0 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v17, v14 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v14, v3 -; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v15, v4, vcc +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v5, v0 +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, v4, v1, vcc +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; TONGA-NEXT: v_mul_lo_u32 v4, s2, v7 +; TONGA-NEXT: v_mul_lo_u32 v5, s3, v6 +; TONGA-NEXT: v_mul_hi_u32 v8, v6, v0 +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v8, v4 +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; TONGA-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v19, v0 -; TONGA-NEXT: v_addc_u32_e32 v4, vcc, v20, v1, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v15 -; TONGA-NEXT: v_xor_b32_e32 v14, v0, v15 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v14, v4, 0 -; TONGA-NEXT: v_mul_hi_u32 v17, v14, v3 -; TONGA-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc -; TONGA-NEXT: v_xor_b32_e32 v7, v7, v15 -; TONGA-NEXT: v_add_u32_e32 v17, vcc, v17, v0 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v3, 0 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v4, 0 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v17, v0 -; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v18, v1, vcc -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v0, v3 +; TONGA-NEXT: s_add_u32 s0, s18, s26 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v0 +; TONGA-NEXT: s_addc_u32 s1, s19, s26 +; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc +; TONGA-NEXT: s_xor_b64 s[28:29], s[0:1], s[26:27] +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s28, v3, 0 +; TONGA-NEXT: v_mul_hi_u32 v4, s28, v2 +; TONGA-NEXT: 
v_add_u32_e32 v4, vcc, v4, v0 +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s29, v2, 0 +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s29, v3, 0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v5, v1, vcc +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; TONGA-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc -; TONGA-NEXT: v_mul_lo_u32 v4, v5, v0 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v5, v3, 0 -; TONGA-NEXT: v_mul_lo_u32 v3, v16, v3 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; TONGA-NEXT: v_mul_lo_u32 v3, s22, v0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s22, v2, 0 +; TONGA-NEXT: v_mul_lo_u32 v2, s23, v2 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v7, v1 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v14, v0 -; TONGA-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v16, vcc -; TONGA-NEXT: v_sub_u32_e64 v4, s[0:1], v0, v5 -; TONGA-NEXT: v_subbrev_u32_e64 v14, s[2:3], 0, v3, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v16 -; TONGA-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v4, v5 -; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v14, v16 -; TONGA-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v16, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v16 -; TONGA-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v18, s[0:1], v4, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; TONGA-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, s29, v1 +; TONGA-NEXT: v_mov_b32_e32 v3, s23 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, s28, v0 +; TONGA-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc +; TONGA-NEXT: v_subrev_u32_e64 v4, s[0:1], s22, v0 +; TONGA-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s23, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s22, v4 +; TONGA-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], s23, v5 +; TONGA-NEXT: v_subrev_u32_e64 v3, s[0:1], s22, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] +; TONGA-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; TONGA-NEXT: v_mov_b32_e32 v4, s29 +; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s23, v1 +; TONGA-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s22, v0 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; TONGA-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v1, v16 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 -; TONGA-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v18, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v15 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v15 -; TONGA-NEXT: v_sub_u32_e32 v14, vcc, v0, v15 -; TONGA-NEXT: v_subb_u32_e32 v15, vcc, v1, v15, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_12 -; 
TONGA-NEXT: .LBB12_11: -; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v2 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, 0, v2 -; TONGA-NEXT: v_mov_b32_e32 v15, 0 +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, s23, v1 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, s26, v0 +; TONGA-NEXT: v_xor_b32_e32 v1, s26, v1 +; TONGA-NEXT: v_mov_b32_e32 v2, s26 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s26, v0 +; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; TONGA-NEXT: s_cbranch_execnz .LBB12_3 +; TONGA-NEXT: .LBB12_2: +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s20 +; TONGA-NEXT: s_sub_i32 s0, 0, s20 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 -; TONGA-NEXT: v_mul_lo_u32 v1, v1, v0 +; TONGA-NEXT: v_mul_lo_u32 v1, s0, v0 ; TONGA-NEXT: v_mul_hi_u32 v1, v0, v1 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; TONGA-NEXT: v_mul_hi_u32 v0, v6, v0 -; TONGA-NEXT: v_mul_lo_u32 v0, v0, v2 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v6, v0 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; TONGA-NEXT: v_mul_hi_u32 v0, s18, v0 +; TONGA-NEXT: v_mul_lo_u32 v0, v0, s20 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, s18, v0 +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s20, v0 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s20, v0 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; TONGA-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s20, v0 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s20, v0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; TONGA-NEXT: v_mov_b32_e32 v1, 0 +; TONGA-NEXT: .LBB12_3: +; TONGA-NEXT: s_or_b64 s[0:1], s[14:15], s[16:17] +; TONGA-NEXT: s_mov_b32 s0, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[0:1], 0 +; TONGA-NEXT: s_cbranch_scc0 .LBB12_14 +; TONGA-NEXT: ; %bb.4: +; TONGA-NEXT: s_ashr_i32 s0, s17, 31 +; TONGA-NEXT: s_add_u32 s2, s16, s0 +; TONGA-NEXT: s_mov_b32 s1, s0 +; TONGA-NEXT: s_addc_u32 s3, s17, s0 +; TONGA-NEXT: s_xor_b64 s[18:19], s[2:3], s[0:1] +; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s18 +; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s19 +; TONGA-NEXT: s_sub_u32 s2, 0, s18 +; TONGA-NEXT: s_subb_u32 s3, 0, s19 +; TONGA-NEXT: s_ashr_i32 s22, s15, 31 +; TONGA-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2 +; TONGA-NEXT: v_rcp_f32_e32 v2, v2 +; TONGA-NEXT: s_mov_b32 s23, s22 +; TONGA-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; TONGA-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; TONGA-NEXT: v_trunc_f32_e32 v3, v3 +; TONGA-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2 +; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v3 +; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v2 +; TONGA-NEXT: v_mul_lo_u32 v4, s2, v6 +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, 0 +; TONGA-NEXT: v_mul_lo_u32 v5, s3, v7 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v3, v5 +; TONGA-NEXT: v_mul_hi_u32 v8, v7, v2 +; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v5, 0 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v2, 0 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v5, 0 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v8, v2 +; TONGA-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; TONGA-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; TONGA-NEXT: v_addc_u32_e32 
v3, vcc, 0, v3, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v7, v2 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, v6, v3, vcc +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 +; TONGA-NEXT: v_mul_lo_u32 v6, s2, v9 +; TONGA-NEXT: v_mul_lo_u32 v7, s3, v8 +; TONGA-NEXT: v_mul_hi_u32 v10, v8, v2 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v9, v2, 0 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v3, 0 +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v3, 0 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v10, v6 +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; TONGA-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc +; TONGA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; TONGA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; TONGA-NEXT: s_add_u32 s0, s14, s22 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v8, v2 +; TONGA-NEXT: s_addc_u32 s1, s15, s22 +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, v9, v3, vcc +; TONGA-NEXT: s_xor_b64 s[24:25], s[0:1], s[22:23] +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s24, v5, 0 +; TONGA-NEXT: v_mul_hi_u32 v6, s24, v4 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s25, v4, 0 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s25, v5, 0 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; TONGA-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc +; TONGA-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v2, v4 +; TONGA-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; TONGA-NEXT: v_mul_lo_u32 v5, s18, v2 +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v4, 0 +; TONGA-NEXT: v_mul_lo_u32 v4, s19, v4 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s25, v3 +; TONGA-NEXT: v_mov_b32_e32 v5, s19 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, s24, v2 +; TONGA-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc +; TONGA-NEXT: v_subrev_u32_e64 v6, s[0:1], s18, v2 +; TONGA-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s19, v7 +; TONGA-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s18, v6 +; TONGA-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], s19, v7 +; TONGA-NEXT: v_subrev_u32_e64 v5, s[0:1], s18, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] +; TONGA-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] +; TONGA-NEXT: v_mov_b32_e32 v6, s25 +; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s19, v3 +; TONGA-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s18, v2 +; TONGA-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, s19, v3 +; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; TONGA-NEXT: v_xor_b32_e32 v2, s22, v2 +; TONGA-NEXT: v_xor_b32_e32 v3, s22, v3 +; TONGA-NEXT: v_mov_b32_e32 v4, s22 +; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s22, v2 +; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; TONGA-NEXT: s_cbranch_execnz .LBB12_6 
+; TONGA-NEXT: .LBB12_5: +; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s16 +; TONGA-NEXT: s_sub_i32 s0, 0, s16 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 +; TONGA-NEXT: v_mul_lo_u32 v3, s0, v2 +; TONGA-NEXT: v_mul_hi_u32 v3, v2, v3 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; TONGA-NEXT: v_mul_hi_u32 v2, s14, v2 +; TONGA-NEXT: v_mul_lo_u32 v2, v2, s16 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, s14, v2 +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s16, v2 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s16, v2 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; TONGA-NEXT: v_mov_b32_e32 v3, 0 +; TONGA-NEXT: .LBB12_6: +; TONGA-NEXT: s_or_b64 s[0:1], s[12:13], s[10:11] +; TONGA-NEXT: s_mov_b32 s0, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[0:1], 0 +; TONGA-NEXT: s_cbranch_scc0 .LBB12_15 +; TONGA-NEXT: ; %bb.7: +; TONGA-NEXT: s_ashr_i32 s0, s11, 31 +; TONGA-NEXT: s_add_u32 s2, s10, s0 +; TONGA-NEXT: s_mov_b32 s1, s0 +; TONGA-NEXT: s_addc_u32 s3, s11, s0 +; TONGA-NEXT: s_xor_b64 s[14:15], s[2:3], s[0:1] +; TONGA-NEXT: v_cvt_f32_u32_e32 v4, s14 +; TONGA-NEXT: v_cvt_f32_u32_e32 v5, s15 +; TONGA-NEXT: s_sub_u32 s2, 0, s14 +; TONGA-NEXT: s_subb_u32 s3, 0, s15 +; TONGA-NEXT: s_ashr_i32 s18, s13, 31 +; TONGA-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; TONGA-NEXT: v_rcp_f32_e32 v4, v4 +; TONGA-NEXT: s_mov_b32 s19, s18 +; TONGA-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; TONGA-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; TONGA-NEXT: v_trunc_f32_e32 v5, v5 +; TONGA-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v5 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v4 +; TONGA-NEXT: v_mul_lo_u32 v6, s2, v8 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v9, 0 +; TONGA-NEXT: v_mul_lo_u32 v7, s3, v9 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v6 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v5, v7 +; TONGA-NEXT: v_mul_hi_u32 v10, v9, v4 +; TONGA-NEXT: v_mad_u64_u32 v[5:6], s[0:1], v9, v7, 0 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v5 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v8, v4, 0 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v6, vcc +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v7, 0 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v10, v4 +; TONGA-NEXT: v_addc_u32_e32 v4, vcc, v11, v5, vcc +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v9, v4 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v8, v5, vcc +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v10, 0 +; TONGA-NEXT: v_mul_lo_u32 v8, s2, v11 +; TONGA-NEXT: v_mul_lo_u32 v9, s3, v10 +; TONGA-NEXT: v_mul_hi_u32 v12, v10, v4 +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v11, v4, 0 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v8, v5 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v9 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v5, 0 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v11, v5, 0 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v12, v8 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; TONGA-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; TONGA-NEXT: s_add_u32 s0, s12, s18 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v10, v4 +; TONGA-NEXT: s_addc_u32 s1, s13, s18 +; 
TONGA-NEXT: v_addc_u32_e32 v7, vcc, v11, v5, vcc +; TONGA-NEXT: s_xor_b64 s[20:21], s[0:1], s[18:19] +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v7, 0 +; TONGA-NEXT: v_mul_hi_u32 v8, s20, v6 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v4 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s21, v6, 0 +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s21, v7, 0 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v8, v4 +; TONGA-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v4, v6 +; TONGA-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc +; TONGA-NEXT: v_mul_lo_u32 v7, s14, v4 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s14, v6, 0 +; TONGA-NEXT: v_mul_lo_u32 v6, s15, v6 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v5 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s21, v5 +; TONGA-NEXT: v_mov_b32_e32 v7, s15 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s20, v4 +; TONGA-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc +; TONGA-NEXT: v_subrev_u32_e64 v8, s[0:1], s14, v4 +; TONGA-NEXT: v_subbrev_u32_e64 v9, s[2:3], 0, v6, s[0:1] +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s15, v9 +; TONGA-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v8 +; TONGA-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], s15, v9 +; TONGA-NEXT: v_subrev_u32_e64 v7, s[0:1], s14, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[2:3] +; TONGA-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 +; TONGA-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; TONGA-NEXT: v_mov_b32_e32 v8, s21 +; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v8, v5, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s15, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s14, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, s15, v5 +; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; TONGA-NEXT: v_xor_b32_e32 v4, s18, v4 +; TONGA-NEXT: v_xor_b32_e32 v5, s18, v5 +; TONGA-NEXT: v_mov_b32_e32 v6, s18 +; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, s18, v4 +; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc +; TONGA-NEXT: s_cbranch_execnz .LBB12_9 +; TONGA-NEXT: .LBB12_8: +; TONGA-NEXT: v_cvt_f32_u32_e32 v4, s10 +; TONGA-NEXT: s_sub_i32 s0, 0, s10 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; TONGA-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v4 +; TONGA-NEXT: v_mul_lo_u32 v5, s0, v4 +; TONGA-NEXT: v_mul_hi_u32 v5, v4, v5 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; TONGA-NEXT: v_mul_hi_u32 v4, s12, v4 +; TONGA-NEXT: v_mul_lo_u32 v4, v4, s10 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s12, v4 +; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s10, v4 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s10, v4 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; TONGA-NEXT: v_mov_b32_e32 v5, 0 +; TONGA-NEXT: .LBB12_9: +; TONGA-NEXT: s_or_b64 s[0:1], s[8:9], s[6:7] +; TONGA-NEXT: s_mov_b32 s0, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[0:1], 0 +; TONGA-NEXT: s_cbranch_scc0 .LBB12_16 +; TONGA-NEXT: 
; %bb.10: +; TONGA-NEXT: s_ashr_i32 s0, s7, 31 +; TONGA-NEXT: s_add_u32 s2, s6, s0 +; TONGA-NEXT: s_mov_b32 s1, s0 +; TONGA-NEXT: s_addc_u32 s3, s7, s0 +; TONGA-NEXT: s_xor_b64 s[10:11], s[2:3], s[0:1] +; TONGA-NEXT: v_cvt_f32_u32_e32 v6, s10 +; TONGA-NEXT: v_cvt_f32_u32_e32 v7, s11 +; TONGA-NEXT: s_sub_u32 s2, 0, s10 +; TONGA-NEXT: s_subb_u32 s3, 0, s11 +; TONGA-NEXT: s_ashr_i32 s14, s9, 31 +; TONGA-NEXT: v_madmk_f32 v6, v7, 0x4f800000, v6 +; TONGA-NEXT: v_rcp_f32_e32 v6, v6 +; TONGA-NEXT: s_mov_b32 s15, s14 +; TONGA-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; TONGA-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; TONGA-NEXT: v_trunc_f32_e32 v7, v7 +; TONGA-NEXT: v_madmk_f32 v6, v7, 0xcf800000, v6 +; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v7 +; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v6 +; TONGA-NEXT: v_mul_lo_u32 v8, s2, v10 +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v9, s3, v11 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v7, v9 +; TONGA-NEXT: v_mul_hi_u32 v12, v11, v6 +; TONGA-NEXT: v_mad_u64_u32 v[7:8], s[0:1], v11, v9, 0 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v12, v7 +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v6, 0 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v8, vcc +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v9, 0 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v12, v6 +; TONGA-NEXT: v_addc_u32_e32 v6, vcc, v13, v7, vcc +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v11, v6 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v10, v7, vcc +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v12, 0 +; TONGA-NEXT: v_mul_lo_u32 v10, s2, v13 +; TONGA-NEXT: v_mul_lo_u32 v11, s3, v12 +; TONGA-NEXT: v_mul_hi_u32 v14, v12, v6 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v13, v6, 0 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v10, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v7, 0 +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v13, v7, 0 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v14, v10 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v11, v9, vcc +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; TONGA-NEXT: s_add_u32 s0, s8, s14 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v12, v6 +; TONGA-NEXT: s_addc_u32 s1, s9, s14 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, v13, v7, vcc +; TONGA-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s16, v9, 0 +; TONGA-NEXT: v_mul_hi_u32 v10, s16, v8 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v6 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s17, v8, 0 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s17, v9, 0 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v10, v6 +; TONGA-NEXT: v_addc_u32_e32 v6, vcc, v11, v7, vcc +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v6, v8 +; TONGA-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc +; TONGA-NEXT: v_mul_lo_u32 v9, s10, v6 +; TONGA-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s10, v8, 0 +; TONGA-NEXT: v_mul_lo_u32 v8, s11, v8 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v8, v7 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s17, v7 +; TONGA-NEXT: v_mov_b32_e32 v9, s11 +; TONGA-NEXT: v_sub_u32_e32 
v6, vcc, s16, v6 +; TONGA-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v9, vcc +; TONGA-NEXT: v_subrev_u32_e64 v10, s[0:1], s10, v6 +; TONGA-NEXT: v_subbrev_u32_e64 v11, s[2:3], 0, v8, s[0:1] +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s11, v11 +; TONGA-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v10 +; TONGA-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v9, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], s11, v11 +; TONGA-NEXT: v_subrev_u32_e64 v9, s[0:1], s10, v10 +; TONGA-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[2:3] +; TONGA-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v8, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 +; TONGA-NEXT: v_cndmask_b32_e64 v9, v10, v9, s[0:1] +; TONGA-NEXT: v_mov_b32_e32 v10, s17 +; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s11, v7 +; TONGA-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v8, v11, v8, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, s11, v7 +; TONGA-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; TONGA-NEXT: v_xor_b32_e32 v6, s14, v6 +; TONGA-NEXT: v_xor_b32_e32 v7, s14, v7 +; TONGA-NEXT: v_mov_b32_e32 v8, s14 +; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, s14, v6 +; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc +; TONGA-NEXT: s_cbranch_execnz .LBB12_12 +; TONGA-NEXT: .LBB12_11: +; TONGA-NEXT: v_cvt_f32_u32_e32 v6, s6 +; TONGA-NEXT: s_sub_i32 s0, 0, s6 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6 +; TONGA-NEXT: v_mul_lo_u32 v7, s0, v6 +; TONGA-NEXT: v_mul_hi_u32 v7, v6, v7 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v7 +; TONGA-NEXT: v_mul_hi_u32 v6, s8, v6 +; TONGA-NEXT: v_mul_lo_u32 v6, v6, s6 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s8, v6 +; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s6, v6 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s6, v6 +; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s6, v6 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s6, v6 +; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; TONGA-NEXT: v_mov_b32_e32 v7, 0 ; TONGA-NEXT: .LBB12_12: -; TONGA-NEXT: v_mov_b32_e32 v0, s4 -; TONGA-NEXT: v_mov_b32_e32 v1, s5 +; TONGA-NEXT: v_mov_b32_e32 v9, s5 +; TONGA-NEXT: v_mov_b32_e32 v8, s4 ; TONGA-NEXT: s_add_u32 s0, s4, 16 -; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; TONGA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; TONGA-NEXT: s_addc_u32 s1, s5, 0 ; TONGA-NEXT: v_mov_b32_e32 v0, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 -; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; TONGA-NEXT: s_endpgm ; TONGA-NEXT: .LBB12_13: -; TONGA-NEXT: ; implicit-def: $vgpr8_vgpr9 +; TONGA-NEXT: ; implicit-def: $vgpr0_vgpr1 ; TONGA-NEXT: s_branch .LBB12_2 ; TONGA-NEXT: .LBB12_14: ; TONGA-NEXT: s_branch .LBB12_5 ; TONGA-NEXT: .LBB12_15: -; TONGA-NEXT: ; implicit-def: $vgpr12_vgpr13 +; TONGA-NEXT: ; implicit-def: $vgpr4_vgpr5 ; TONGA-NEXT: s_branch .LBB12_8 ; TONGA-NEXT: .LBB12_16: ; TONGA-NEXT: s_branch .LBB12_11 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c729c3fb8a4e4..47dfa9f4fc2d3 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -380,12 +380,11 @@ 
define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v7, v12 +; GCN-IR-NEXT: v_not_b32_e32 v6, v12 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8 -; GCN-IR-NEXT: v_not_b32_e32 v6, 0 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v13 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 5acbb044c1057..e9017939f8a4a 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -348,10 +348,9 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v10 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v14 -; GCN-IR-NEXT: v_not_b32_e32 v1, 0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], -1, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index d0d1ba82dc000..fe237dfa9cb52 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -53,7 +53,6 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 ; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] -; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] ; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59] @@ -72,7 +71,6 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[44:45] ; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] ; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll index 0fc655af37fa1..473d996bf721d 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -31,5 +31,5 @@ define amdgpu_kernel void @kernel1() #1 { attributes #0 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll index b5c14c56b0aae..c9ee40cec878a 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll @@ -97,8 +97,8 @@ define amdgpu_kernel void @kernel2() #0 { attributes #0 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { 
"uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll index 6464ff3a50dff..308f8b595eb06 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 { attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll index ea3c0a33c2da0..7e2b085f5a879 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel2() #2 { attributes #1 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll index 3823d17658fd5..3d6454cc9f99b 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -52,8 +52,8 @@ attributes #0 = { nounwind } attributes #1 = { "uniform-work-group-size"="false" } attributes #2 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { nounwind "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll index e4783597b5d00..3032d8ddf0a53 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 { attributes #0 = { nounwind readnone } attributes #1 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. 
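Note on the uniform-work-group-* hunks above and the one below: each is the same mechanical update, the regenerated CHECK lines simply stop expecting the default "amdgpu-waves-per-eu"="4,10" entry in the inferred attribute groups, while every other attribute is unchanged. A minimal sketch of the kind of module these tests push through the attributor (hypothetical kernel name; the usual -passes=amdgpu-attributor RUN-line spelling is assumed here, and the checks would be regenerated with utils/update_test_checks.py):

  ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
  ; A kernel with no implicit-argument uses: the attributor infers the amdgpu-no-*
  ; attributes seen in the hunks above, but after this change no default
  ; "amdgpu-waves-per-eu"="4,10" entry is expected in the attribute group.
  define amdgpu_kernel void @kernel() #0 {
    ret void
  }
  attributes #0 = { "uniform-work-group-size"="false" }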
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll index 1c054fb97ee19..e315e0454f424 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -61,5 +61,5 @@ define amdgpu_kernel void @kernel3() #0 { attributes #0 = { "uniform-work-group-size"="false" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir new file mode 100644 index 0000000000000..8b467eb0b054e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -0,0 +1,1173 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX950 %s +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX942 %s +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90a %s + +--- +name: test_pk_mul_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_pk_mul_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = 
V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_pk_mul_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept 
V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_pk_mul_unpacking_f32 + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def 
$vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_op_sel_selection_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5'} + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable 
$sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def 
$vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... 
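+# Note on the two documents above: with the si-pre-emit-peephole RUN lines at
+# the top of this file, the gfx942/gfx950 checks show the packed multiply being
+# rewritten into two scalar VALU ops, while gfx90a keeps the packed form.
+# Schematically (operands abbreviated from the checks above):
+#   renamable $vgpr16_vgpr17 = V_PK_MUL_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, ...
+# becomes, on gfx942/gfx950:
+#   $vgpr16 = V_MUL_F32_e64 0, $sgpr30, 0, $vgpr4, ...
+#   $vgpr17 = V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, ...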
+--- +name: test_op_sel_hi_selection_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5'} + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable 
$sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed 
$sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... 
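+# Note on the op_sel/op_sel_hi documents above: in these VOP3P MIR operands the
+# immediate printed before each source is a SISrcMods mask, where OP_SEL_0 (4)
+# makes the low result lane read the source's high half and OP_SEL_1 (8) makes
+# the high result lane read the high half; 8 alone is the default packed
+# behaviour. That is why src1 mods of 12 above unpack so that both V_MUL_F32_e64
+# lanes read $vgpr5, and src0 mods of 0 unpack so that both lanes read $sgpr30.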
+--- +name: test_pk_add_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_pk_add_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + ; GFX950-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; GFX950-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + ; GFX950-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; GFX950-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; 
GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX950-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_pk_add_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + ; GFX942-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; GFX942-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + ; GFX942-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; GFX942-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr2 = nofpexcept 
V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_pk_add_unpacking_f32
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
+ ; GFX90a-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
+ renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
+ renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+ renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
+ $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+ early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+
+...
+---
+name: test_pk_fma_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GFX950-LABEL: name: test_pk_fma_unpacking_f32
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_pk_fma_unpacking_f32
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_pk_fma_unpacking_f32
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_unpacking_does_not_introduce_rw_dependency
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GFX950-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_opcodes_not_supported_for_unpacking_are_skipped
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GFX950-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+
+ $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+
+ $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+
+ S_ENDPGM 0
+
+...
+---
+name: test_opsel_register_is_correctly_marked_as_killed
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GFX950-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_inst_dependent_on_mfma_are_not_unpacked
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GFX950-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX950-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX950-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX90a-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+--- +name: test_mfma_def_using_instr_blocks_unpacking +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_mfma_def_using_instr_blocks_unpacking + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX950-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_mfma_def_using_instr_blocks_unpacking + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, 
implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_mfma_def_using_instr_blocks_unpacking + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 
0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... 
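+# In test_mfma_def_using_instr_blocks_unpacking above, the
+# V_ACCVGPR_READ_B32_e64 of $agpr3 reads the MFMA result between the MFMA and
+# the packed op; per the test name, such an intervening use of the MFMA def
+# appears to block the rewrite, so the checks keep V_PK_FMA_F32 packed. As a
+# rough sketch only (not autogenerated checks; operand names reused from the
+# checks above), the unpacked form exercised by the later tests in this file
+# would look roughly like:
+#
+#   $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
+#   $vgpr17 = nofpexcept V_FMA_F32_e64 0, $sgpr31, 0, $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec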
+--- +name: test_unpacking_with_imm_input +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_unpacking_with_imm_input + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_unpacking_with_imm_input + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: 
S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_unpacking_with_imm_input + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, 
implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... 
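+# A note on the immediate above: 1065353216 is the IEEE-754 bit pattern of
+# 1.0f. Comparing the check prefixes, GFX950 and GFX942 split the packed
+# multiply into two V_MUL_F32_e64 that each repeat the 32-bit literal, while
+# GFX90a keeps the single V_PK_MUL_F32; schematically (lines taken from the
+# checks above):
+#
+#   renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, ...
+#     =>
+#   $vgpr16 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+#   $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec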
+--- +name: test_neg_lo_hi_post_unpacking +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_neg_lo_hi_post_unpacking + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_neg_lo_hi_post_unpacking + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; 
GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_neg_lo_hi_post_unpacking + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed 
$vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 94f1b83ea2765..6480a88d40f5a 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -355,12 +355,11 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v7, v12 +; GCN-IR-NEXT: v_not_b32_e32 v6, v12 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8 -; GCN-IR-NEXT: v_not_b32_e32 v6, 0 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v13 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll new file mode 100644 index 0000000000000..138d437762488 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx950 < %s | FileCheck %s + +declare void @llvm.amdgcn.sched.barrier(i32 %mask) +declare void @llvm.amdgcn.load.to.lds(ptr %in, ptr addrspace(3) %lds_out, i32 %size, i32 %offset, i32 %aux) + +define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) 
%global_buffer, ptr addrspace(3) %lds_buffer1, ptr addrspace(3) %lds_buffer2) #0 {
+; This test checks whether the SIInsertWaitcnts pass inserts an S_WAITCNT VMCNT(0) before the DS_READ.
+; CHECK-LABEL: test_waitcnt:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_u32 s4, s0, 64
+; CHECK-NEXT:    s_addc_u32 s5, s1, 0
+; CHECK-NEXT:    s_mov_b32 m0, s2
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    global_load_lds_dword v0, s[4:5] offset:4
+; CHECK-NEXT:    s_load_dword s4, s[0:1], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, s4
+; CHECK-NEXT:    global_store_dword v0, v1, s[0:1] offset:64
+; CHECK-NEXT:    ; sched_barrier mask(0x00000000)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    ds_write_b32 v2, v1
+; CHECK-NEXT:    ds_write_b32 v3, v1
+; CHECK-NEXT:    ; sched_barrier mask(0x00000000)
+; CHECK-NEXT:    ds_read_b32 v2, v2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_dword v0, v2, s[0:1] offset:16
+; CHECK-NEXT:    global_store_dword v0, v1, s[0:1] offset:32
+; CHECK-NEXT:    s_endpgm
+entry:
+  ; VMEM accesses with alias.scope
+  %vmem_load = load i32, ptr addrspace(1) %global_buffer
+  %gepvmem = getelementptr i32, ptr addrspace(1) %global_buffer, i32 16
+  store i32 %vmem_load, ptr addrspace(1) %gepvmem, align 4, !alias.scope !0
+
+  ; Global to LDS load
+  %gepvmem.ascast = addrspacecast ptr addrspace(1) %gepvmem to ptr
+  call void @llvm.amdgcn.load.to.lds(ptr %gepvmem.ascast, ptr addrspace(3) %lds_buffer1, i32 4, i32 4, i32 0), !alias.scope !9, !noalias !14
+
+  ; Insert scheduling barrier
+  call void @llvm.amdgcn.sched.barrier(i32 0)
+
+  ; DS_WRITEs with alias.scope and noalias
+  store i32 %vmem_load, ptr addrspace(3) %lds_buffer1, align 4, !alias.scope !1, !noalias !12
+  store i32 %vmem_load, ptr addrspace(3) %lds_buffer2, align 4, !alias.scope !6, !noalias !13
+
+  ; Insert scheduling barrier
+  call void @llvm.amdgcn.sched.barrier(i32 0)
+
+  ; DS_READ with alias.scope missing
+  %lds_load = load i32, ptr addrspace(3) %lds_buffer1, align 4, !noalias !12
+
+  ; VMEM write
+  %gep = getelementptr i32, ptr addrspace(1) %global_buffer, i32 4
+  %gep2 = getelementptr i32, ptr addrspace(1) %global_buffer, i32 8
+  store i32 %lds_load, ptr addrspace(1) %gep, align 4, !alias.scope !0
+  store i32 %vmem_load, ptr addrspace(1) %gep2, align 4, !alias.scope !0
+
+  ret void
+}
+
+; VMEM alias domain and scope
+!5 = !{!"vmem.domain"}
+!4 = !{!"vmem.scope", !5}
+!0 = !{!4}
+
+; LDS alias domains and scopes
+!3 = !{!"lds1.domain"}
+!2 = !{!"lds1.scope", !3}
+!1 = !{!2}
+
+!8 = !{!"lds2.domain"}
+!7 = !{!"lds2.scope", !8}
+!6 = !{!7}
+
+!11 = !{!"lds1_off4.domain"}
+!10 = !{!"lds1_off4.scope", !11}
+!9 = !{!10}
+
+; Noalias lists
+!12 = !{!7, !10}
+!13 = !{!2, !10}
+!14 = !{!2, !7}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/DebugInfo/AMDGPU/live-debug-values-spill-tracking.mir b/llvm/test/DebugInfo/AMDGPU/live-debug-values-spill-tracking.mir
index 31bdb1194880a..b32094fcb93da 100644
--- a/llvm/test/DebugInfo/AMDGPU/live-debug-values-spill-tracking.mir
+++ b/llvm/test/DebugInfo/AMDGPU/live-debug-values-spill-tracking.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass livedebugvalues %s -o - -debug-only livedebugvalues 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
 # Verify that spill tracking is disabled on amdgcn.
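+# (REQUIRES: asserts is needed because -debug-only output exists only in
+# assertion-enabled builds; without it, the message checked below would
+# never be printed.)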
# CHECK: Disabling InstrRefBasedLDV spill tracking for kern since target has too many potential stack slot indexes diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s index 6bb0f4b1dff2d..3d6af6ba6dbf8 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -3628,6 +3628,18 @@ v_alignbit_b32 v5, v1, v2, exec_lo v_alignbit_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4e,0xd5,0x01,0x05,0xfe,0x01] +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] + v_alignbyte_b32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] @@ -3715,6 +3727,18 @@ v_alignbyte_b32 v5, v1, v2, exec_lo v_alignbyte_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0xfe,0x01] +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] + v_mullit_f32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x50,0xd5,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx7_err_pos.s b/llvm/test/MC/AMDGPU/gfx7_err_pos.s index 9dcbd4a4074af..7b6b241e04707 100644 --- a/llvm/test/MC/AMDGPU/gfx7_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx7_err_pos.s @@ -44,3 +44,16 @@ s_load_dword s5, s[2:3], glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: cache policy is not supported for SMRD instructions // CHECK-NEXT:{{^}}s_load_dword s5, s[2:3], glc // CHECK-NEXT:{{^}} ^ + +//============================================================================== +// not a valid operand + +v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ + +v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] +// CHECK-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx8_err_pos.s b/llvm/test/MC/AMDGPU/gfx8_err_pos.s index 1e8457d54049a..a475c739e690d 100644 --- a/llvm/test/MC/AMDGPU/gfx8_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx8_err_pos.s @@ -49,3 +49,13 @@ v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERV // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
 // CHECK-NEXT:{{^}}v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 src1_sel:WORD_0
 // CHECK-NEXT:{{^}} ^
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}} ^
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
index f3f4cae22538a..a1cd9ce8ef18e 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
@@ -2829,6 +2829,18 @@ v_alignbit_b32 v5, v1, v2, src_execz
 v_alignbit_b32 v5, v1, v2, src_scc
 // CHECK: [0x05,0x00,0xce,0xd1,0x01,0x05,0xf6,0x03]
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
 v_alignbyte_b32 v5, v1, v2, v3
 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04]
@@ -3000,6 +3012,18 @@ v_alignbyte_b32 v5, v1, v2, src_execz
 v_alignbyte_b32 v5, v1, v2, src_scc
 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xf6,0x03]
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
 v_min3_f32 v5, v1, v2, v3
 // CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04]
diff --git a/llvm/test/MC/AMDGPU/mai-gfx950-err.s b/llvm/test/MC/AMDGPU/mai-gfx950-err.s
index e700b0b3cabfe..5c9dbd7f7636f 100644
--- a/llvm/test/MC/AMDGPU/mai-gfx950-err.s
+++ b/llvm/test/MC/AMDGPU/mai-gfx950-err.s
@@ -156,3 +156,51 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:9], v[0:3] v20, v21 blgp
 v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:11], v[0:3] v20, v21 blgp:4
 // CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 4
+
+
+// Work around a hardware bug by disallowing SGPR and inline-constant scale operands
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24
+// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, v24
+// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, 9 +// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, 4.0 +// CHECK: :[[@LINE-1]]:77: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, v24 +// CHECK: :[[@LINE-1]]:72: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 +// CHECK: :[[@LINE-1]]:78: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, v24 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, v24 +// CHECK: :[[@LINE-1]]:73: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s index 2d3a56703674a..c9035033912ac 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -405,58 +405,6 @@ v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], a[4:11], v[12:19], v[20:23], v24, v25 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: 
[0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0 - -// GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16 - // GFX950: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0b,0xad,0xd3,0x04,0x19,0x52,0x24] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[20:23], v24, v25 cbsz:3 blgp:1 @@ -585,22 +533,6 @@ v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:29], v[32:47], v48, v49 // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:21], v[24:29], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,1,0] cbsz:2 blgp:3 -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0 - -// GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU -v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16 - // op_sel combinations // GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, v49 op_sel:[0,1,0] op_sel_hi:[0,0,0] ; encoding: [0x00,0x10,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt index 721babdd64245..08ed50d92ba83 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt @@ -1146,6 +1146,18 @@ # GFX10: v_alignbit_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04 +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04 + # GFX10: v_alignbyte_b32 v255, v1, v2, v3 ; encoding: [0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] 
0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04 @@ -1233,6 +1245,18 @@ # GFX10: v_alignbyte_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04 +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04 + # GFX10: v_and_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt index 77b87ac63f335..e191455beb64d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt @@ -392,27 +392,6 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:17], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x64 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 33, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xa1,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 9, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x89,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], m0, m0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x7c,0xf8,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s20, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x14,0x12,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x30,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], s24, v44 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x58,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 9 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x02,0x13,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - # GFX950: 
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 @@ -422,15 +401,6 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] blgp:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x24 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:1 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x09,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v44, s24 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x2c,0x31,0x00,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], vcc_lo, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x6a,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:15], v[20:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x00,0x0c,0xad,0xd3,0x04,0x19,0x52,0x84 @@ -467,18 +437,6 @@ # GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[50:53], v[4:11], v[12:19], v[20:23], v24, v25 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] 0x00,0x00,0xac,0xd3,0x18,0x33,0x02,0x00,0x32,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 4.0, v2 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xf6,0x04,0x02,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, 4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0x02,0xed,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], -4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xf7,0xe4,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - -# GFX950: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04] -0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xad,0xd3,0x04,0x19,0x52,0x04 - # GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:19], a[24:27], a[32:47], v48, v49 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x00,0x8c,0xae,0xd3,0x10,0x31,0x82,0x9c @@ -581,18 +539,6 @@ # GFX950: 
v_mfma_scale_f32_32x32x64_f8f6f4 v[50:65], v[16:23], v[24:31], v[32:47], v48, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] 0x00,0x00,0xac,0xd3,0x30,0x63,0x02,0x00,0x32,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 16, v49 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -0x00,0x00,0xac,0xd3,0x90,0x62,0x02,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], v48, -4.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -0x00,0x00,0xac,0xd3,0x30,0xef,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 4.0, 1.0 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -0x00,0x00,0xac,0xd3,0xf6,0xe4,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - -# GFX950: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[32:47], 0.15915494, -16 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04] -0x00,0x00,0xac,0xd3,0xf8,0xa0,0x01,0x00,0x00,0x08,0xae,0xd3,0x10,0x31,0x82,0x04 - # GFX950: v_mfma_i32_16x16x64_i8 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xb6,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x80,0xb6,0xd3,0x00,0x01,0x02,0x1c diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt index 618e081525414..802d6368507e2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt @@ -11310,6 +11310,18 @@ # CHECK: v_alignbit_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01 +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04 + # CHECK: v_alignbyte_b32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04 @@ -11406,6 +11418,18 @@ # CHECK: v_alignbyte_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01 +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04] +0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04 + # CHECK: 
v_min3_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04]
0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
new file mode 100644
index 0000000000000..05d2330fffc7f
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
@@ -0,0 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=sroa,instcombine,aggressive-instcombine %s -S -o - | FileCheck %s
+
+define i64 @quux(ptr %arg) {
+; CHECK-LABEL: define i64 @quux(
+; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[ARG]], align 1
+; CHECK-NEXT:    ret i64 [[LOAD]]
+;
+bb:
+  %load = load i8, ptr %arg, align 1
+  %getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 1
+  %load1 = load i8, ptr %getelementptr, align 1
+  %getelementptr2 = getelementptr inbounds nuw i8, ptr %arg, i64 2
+  %load3 = load i8, ptr %getelementptr2, align 1
+  %getelementptr4 = getelementptr inbounds nuw i8, ptr %arg, i64 3
+  %load5 = load i8, ptr %getelementptr4, align 1
+  %getelementptr6 = getelementptr inbounds nuw i8, ptr %arg, i64 4
+  %load7 = load i8, ptr %getelementptr6, align 1
+  %getelementptr8 = getelementptr inbounds nuw i8, ptr %arg, i64 5
+  %load9 = load i8, ptr %getelementptr8, align 1
+  %getelementptr10 = getelementptr inbounds nuw i8, ptr %arg, i64 6
+  %load11 = load i8, ptr %getelementptr10, align 1
+  %getelementptr12 = getelementptr inbounds nuw i8, ptr %arg, i64 7
+  %load13 = load i8, ptr %getelementptr12, align 1
+  %zext = zext i8 %load13 to i64
+  %shl = shl nuw i64 %zext, 56
+  %zext14 = zext i8 %load11 to i64
+  %shl15 = shl nuw nsw i64 %zext14, 48
+  %or = or disjoint i64 %shl, %shl15
+  %zext16 = zext i8 %load9 to i64
+  %shl17 = shl nuw nsw i64 %zext16, 40
+  %or18 = or disjoint i64 %or, %shl17
+  %zext19 = zext i8 %load7 to i64
+  %shl20 = shl nuw nsw i64 %zext19, 32
+  %or21 = or disjoint i64 %or18, %shl20
+  %zext22 = zext i8 %load5 to i64
+  %shl23 = shl nuw nsw i64 %zext22, 24
+  %or24 = or disjoint i64 %or21, %shl23
+  %zext25 = zext i8 %load3 to i64
+  %shl26 = shl nuw nsw i64 %zext25, 16
+  %zext27 = zext i8 %load1 to i64
+  %shl28 = shl nuw nsw i64 %zext27, 8
+  %or29 = or disjoint i64 %or24, %shl26
+  %zext30 = zext i8 %load to i64
+  %or31 = or i64 %or29, %shl28
+  %or32 = or i64 %or31, %zext30
+  ret i64 %or32
+}
+
+
+; The following test case was reduced from a client kernel.
+define fastcc <16 x float> @hoge(ptr %arg) {
+; CHECK-LABEL: define fastcc <16 x float> @hoge(
+; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[ARG]], align 8
+; CHECK-NEXT:    [[LOAD28:%.*]] = load i64, ptr [[LOAD]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR72:%.*]] = getelementptr i8, ptr [[LOAD]], i64 8
+; CHECK-NEXT:    [[LOAD73:%.*]] = load i64, ptr [[GETELEMENTPTR72]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR120:%.*]] = getelementptr i8, ptr [[LOAD]], i64 16
+; CHECK-NEXT:    [[LOAD121:%.*]] = load i64, ptr [[GETELEMENTPTR120]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR168:%.*]] = getelementptr i8, ptr [[LOAD]], i64 24
+; CHECK-NEXT:    [[LOAD169:%.*]] = load i64, ptr [[GETELEMENTPTR168]], align 1
+; CHECK-NEXT:    [[CALL:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 0, <16 x float> zeroinitializer,
i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL225:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 0, <16 x float> [[CALL]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL230:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 0, <16 x float> [[CALL225]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL235:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 0, <16 x float> [[CALL230]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <16 x float> [[CALL235]] +; +bb: + %load = load ptr, ptr %arg, align 8 + %load28 = load i8, ptr %load, align 1 + %getelementptr30 = getelementptr i8, ptr %load, i64 1 + %load31 = load i8, ptr %getelementptr30, align 1 + %getelementptr36 = getelementptr i8, ptr %load, i64 2 + %load37 = load i8, ptr %getelementptr36, align 1 + %getelementptr42 = getelementptr i8, ptr %load, i64 3 + %load43 = load i8, ptr %getelementptr42, align 1 + %getelementptr48 = getelementptr i8, ptr %load, i64 4 + %load49 = load i8, ptr %getelementptr48, align 1 + %getelementptr54 = getelementptr i8, ptr %load, i64 5 + %load55 = load i8, ptr %getelementptr54, align 1 + %getelementptr60 = getelementptr i8, ptr %load, i64 6 + %load61 = load i8, ptr %getelementptr60, align 1 + %getelementptr66 = getelementptr i8, ptr %load, i64 7 + %load67 = load i8, ptr %getelementptr66, align 1 + %getelementptr72 = getelementptr i8, ptr %load, i64 8 + %load73 = load i8, ptr %getelementptr72, align 1 + %getelementptr78 = getelementptr i8, ptr %load, i64 9 + %load79 = load i8, ptr %getelementptr78, align 1 + %getelementptr84 = getelementptr i8, ptr %load, i64 10 + %load85 = load i8, ptr %getelementptr84, align 1 + %getelementptr90 = getelementptr i8, ptr %load, i64 11 + %load91 = load i8, ptr %getelementptr90, align 1 + %getelementptr96 = getelementptr i8, ptr %load, i64 12 + %load97 = load i8, ptr %getelementptr96, align 1 + %getelementptr102 = getelementptr i8, ptr %load, i64 13 + %load103 = load i8, ptr %getelementptr102, align 1 + %getelementptr108 = getelementptr i8, ptr %load, i64 14 + %load109 = load i8, ptr %getelementptr108, align 1 + %getelementptr114 = getelementptr i8, ptr %load, i64 15 + %load115 = load i8, ptr %getelementptr114, align 1 + %getelementptr120 = getelementptr i8, ptr %load, i64 16 + %load121 = load i8, ptr %getelementptr120, align 1 + %getelementptr126 = getelementptr i8, ptr %load, i64 17 + %load127 = load i8, ptr %getelementptr126, align 1 + %getelementptr132 = getelementptr i8, ptr %load, i64 18 + %load133 = load i8, ptr %getelementptr132, align 1 + %getelementptr138 = getelementptr i8, ptr %load, i64 19 + %load139 = load i8, ptr %getelementptr138, align 1 + %getelementptr144 = getelementptr i8, ptr %load, i64 20 + %load145 = load i8, ptr %getelementptr144, align 1 + %getelementptr150 = getelementptr i8, ptr %load, i64 21 + %load151 = load i8, ptr %getelementptr150, align 1 + %getelementptr156 = getelementptr i8, ptr %load, i64 22 + %load157 = load i8, ptr %getelementptr156, align 1 + %getelementptr162 = getelementptr i8, ptr %load, i64 23 + %load163 = load i8, ptr %getelementptr162, align 1 + %getelementptr168 = getelementptr i8, ptr %load, i64 24 + %load169 = load i8, ptr %getelementptr168, align 1 + %getelementptr174 = getelementptr i8, ptr %load, i64 25 + %load175 = load i8, ptr %getelementptr174, align 1 + %getelementptr180 = getelementptr i8, ptr %load, i64 26 + %load181 = load i8, ptr %getelementptr180, align 1 + %getelementptr186 = getelementptr i8, ptr %load, i64 27 + %load187 = load i8, ptr 
%getelementptr186, align 1 + %getelementptr192 = getelementptr i8, ptr %load, i64 28 + %load193 = load i8, ptr %getelementptr192, align 1 + %getelementptr198 = getelementptr i8, ptr %load, i64 29 + %load199 = load i8, ptr %getelementptr198, align 1 + %getelementptr204 = getelementptr i8, ptr %load, i64 30 + %load205 = load i8, ptr %getelementptr204, align 1 + %getelementptr210 = getelementptr i8, ptr %load, i64 31 + %load211 = load i8, ptr %getelementptr210, align 1 + %alloca1.sroa.8.0.insert.ext = zext i8 %load67 to i64 + %alloca1.sroa.8.0.insert.shift = shl i64 %alloca1.sroa.8.0.insert.ext, 56 + %alloca1.sroa.7.0.insert.ext = zext i8 %load61 to i64 + %alloca1.sroa.7.0.insert.shift = shl i64 %alloca1.sroa.7.0.insert.ext, 48 + %alloca1.sroa.7.0.insert.insert = or i64 %alloca1.sroa.8.0.insert.shift, %alloca1.sroa.7.0.insert.shift + %alloca1.sroa.6.0.insert.ext = zext i8 %load55 to i64 + %alloca1.sroa.6.0.insert.shift = shl i64 %alloca1.sroa.6.0.insert.ext, 40 + %alloca1.sroa.6.0.insert.insert = or i64 %alloca1.sroa.7.0.insert.insert, %alloca1.sroa.6.0.insert.shift + %alloca1.sroa.5.0.insert.ext = zext i8 %load49 to i64 + %alloca1.sroa.5.0.insert.shift = shl i64 %alloca1.sroa.5.0.insert.ext, 32 + %alloca1.sroa.5.0.insert.insert = or i64 %alloca1.sroa.6.0.insert.insert, %alloca1.sroa.5.0.insert.shift + %alloca1.sroa.4.0.insert.ext = zext i8 %load43 to i64 + %alloca1.sroa.4.0.insert.shift = shl i64 %alloca1.sroa.4.0.insert.ext, 24 + %alloca1.sroa.4.0.insert.insert = or i64 %alloca1.sroa.5.0.insert.insert, %alloca1.sroa.4.0.insert.shift + %alloca1.sroa.3.0.insert.ext = zext i8 %load37 to i64 + %alloca1.sroa.3.0.insert.shift = shl i64 %alloca1.sroa.3.0.insert.ext, 16 + %alloca1.sroa.2.0.insert.ext = zext i8 %load31 to i64 + %alloca1.sroa.2.0.insert.shift = shl i64 %alloca1.sroa.2.0.insert.ext, 8 + %alloca1.sroa.2.0.insert.mask = or i64 %alloca1.sroa.4.0.insert.insert, %alloca1.sroa.3.0.insert.shift + %alloca1.sroa.0.0.insert.ext = zext i8 %load28 to i64 + %alloca1.sroa.0.0.insert.mask = or i64 %alloca1.sroa.2.0.insert.mask, %alloca1.sroa.2.0.insert.shift + %alloca1.sroa.0.0.insert.insert = or i64 %alloca1.sroa.0.0.insert.mask, %alloca1.sroa.0.0.insert.ext + %call = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.0.0.insert.insert, i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0) + %alloca1.sroa.17.8.insert.ext = zext i8 %load115 to i64 + %alloca1.sroa.17.8.insert.shift = shl i64 %alloca1.sroa.17.8.insert.ext, 56 + %alloca1.sroa.16.8.insert.ext = zext i8 %load109 to i64 + %alloca1.sroa.16.8.insert.shift = shl i64 %alloca1.sroa.16.8.insert.ext, 48 + %alloca1.sroa.16.8.insert.insert = or i64 %alloca1.sroa.17.8.insert.shift, %alloca1.sroa.16.8.insert.shift + %alloca1.sroa.15.8.insert.ext = zext i8 %load103 to i64 + %alloca1.sroa.15.8.insert.shift = shl i64 %alloca1.sroa.15.8.insert.ext, 40 + %alloca1.sroa.15.8.insert.insert = or i64 %alloca1.sroa.16.8.insert.insert, %alloca1.sroa.15.8.insert.shift + %alloca1.sroa.14.8.insert.ext = zext i8 %load97 to i64 + %alloca1.sroa.14.8.insert.shift = shl i64 %alloca1.sroa.14.8.insert.ext, 32 + %alloca1.sroa.14.8.insert.insert = or i64 %alloca1.sroa.15.8.insert.insert, %alloca1.sroa.14.8.insert.shift + %alloca1.sroa.13.8.insert.ext = zext i8 %load91 to i64 + %alloca1.sroa.13.8.insert.shift = shl i64 %alloca1.sroa.13.8.insert.ext, 24 + %alloca1.sroa.13.8.insert.insert = or i64 %alloca1.sroa.14.8.insert.insert, %alloca1.sroa.13.8.insert.shift + %alloca1.sroa.12.8.insert.ext = zext i8 %load85 to i64 + 
%alloca1.sroa.12.8.insert.shift = shl i64 %alloca1.sroa.12.8.insert.ext, 16 + %alloca1.sroa.11.8.insert.ext = zext i8 %load79 to i64 + %alloca1.sroa.11.8.insert.shift = shl i64 %alloca1.sroa.11.8.insert.ext, 8 + %alloca1.sroa.11.8.insert.mask = or i64 %alloca1.sroa.13.8.insert.insert, %alloca1.sroa.12.8.insert.shift + %alloca1.sroa.9.8.insert.ext = zext i8 %load73 to i64 + %alloca1.sroa.9.8.insert.mask = or i64 %alloca1.sroa.11.8.insert.mask, %alloca1.sroa.11.8.insert.shift + %alloca1.sroa.9.8.insert.insert = or i64 %alloca1.sroa.9.8.insert.mask, %alloca1.sroa.9.8.insert.ext + %call225 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.9.8.insert.insert, i64 0, <16 x float> %call, i32 0, i32 0, i32 0) + %alloca1.sroa.26.16.insert.ext = zext i8 %load163 to i64 + %alloca1.sroa.26.16.insert.shift = shl i64 %alloca1.sroa.26.16.insert.ext, 56 + %alloca1.sroa.25.16.insert.ext = zext i8 %load157 to i64 + %alloca1.sroa.25.16.insert.shift = shl i64 %alloca1.sroa.25.16.insert.ext, 48 + %alloca1.sroa.25.16.insert.insert = or i64 %alloca1.sroa.26.16.insert.shift, %alloca1.sroa.25.16.insert.shift + %alloca1.sroa.24.16.insert.ext = zext i8 %load151 to i64 + %alloca1.sroa.24.16.insert.shift = shl i64 %alloca1.sroa.24.16.insert.ext, 40 + %alloca1.sroa.24.16.insert.insert = or i64 %alloca1.sroa.25.16.insert.insert, %alloca1.sroa.24.16.insert.shift + %alloca1.sroa.23.16.insert.ext = zext i8 %load145 to i64 + %alloca1.sroa.23.16.insert.shift = shl i64 %alloca1.sroa.23.16.insert.ext, 32 + %alloca1.sroa.23.16.insert.insert = or i64 %alloca1.sroa.24.16.insert.insert, %alloca1.sroa.23.16.insert.shift + %alloca1.sroa.22.16.insert.ext = zext i8 %load139 to i64 + %alloca1.sroa.22.16.insert.shift = shl i64 %alloca1.sroa.22.16.insert.ext, 24 + %alloca1.sroa.22.16.insert.insert = or i64 %alloca1.sroa.23.16.insert.insert, %alloca1.sroa.22.16.insert.shift + %alloca1.sroa.21.16.insert.ext = zext i8 %load133 to i64 + %alloca1.sroa.21.16.insert.shift = shl i64 %alloca1.sroa.21.16.insert.ext, 16 + %alloca1.sroa.20.16.insert.ext = zext i8 %load127 to i64 + %alloca1.sroa.20.16.insert.shift = shl i64 %alloca1.sroa.20.16.insert.ext, 8 + %alloca1.sroa.20.16.insert.mask = or i64 %alloca1.sroa.22.16.insert.insert, %alloca1.sroa.21.16.insert.shift + %alloca1.sroa.18.16.insert.ext = zext i8 %load121 to i64 + %alloca1.sroa.18.16.insert.mask = or i64 %alloca1.sroa.20.16.insert.mask, %alloca1.sroa.20.16.insert.shift + %alloca1.sroa.18.16.insert.insert = or i64 %alloca1.sroa.18.16.insert.mask, %alloca1.sroa.18.16.insert.ext + %call230 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.18.16.insert.insert, i64 0, <16 x float> %call225, i32 0, i32 0, i32 0) + %alloca1.sroa.35.24.insert.ext = zext i8 %load211 to i64 + %alloca1.sroa.35.24.insert.shift = shl i64 %alloca1.sroa.35.24.insert.ext, 56 + %alloca1.sroa.34.24.insert.ext = zext i8 %load205 to i64 + %alloca1.sroa.34.24.insert.shift = shl i64 %alloca1.sroa.34.24.insert.ext, 48 + %alloca1.sroa.34.24.insert.insert = or i64 %alloca1.sroa.35.24.insert.shift, %alloca1.sroa.34.24.insert.shift + %alloca1.sroa.33.24.insert.ext = zext i8 %load199 to i64 + %alloca1.sroa.33.24.insert.shift = shl i64 %alloca1.sroa.33.24.insert.ext, 40 + %alloca1.sroa.33.24.insert.insert = or i64 %alloca1.sroa.34.24.insert.insert, %alloca1.sroa.33.24.insert.shift + %alloca1.sroa.32.24.insert.ext = zext i8 %load193 to i64 + %alloca1.sroa.32.24.insert.shift = shl i64 %alloca1.sroa.32.24.insert.ext, 32 + %alloca1.sroa.32.24.insert.insert = or i64 
%alloca1.sroa.33.24.insert.insert, %alloca1.sroa.32.24.insert.shift + %alloca1.sroa.31.24.insert.ext = zext i8 %load187 to i64 + %alloca1.sroa.31.24.insert.shift = shl i64 %alloca1.sroa.31.24.insert.ext, 24 + %alloca1.sroa.31.24.insert.insert = or i64 %alloca1.sroa.32.24.insert.insert, %alloca1.sroa.31.24.insert.shift + %alloca1.sroa.30.24.insert.ext = zext i8 %load181 to i64 + %alloca1.sroa.30.24.insert.shift = shl i64 %alloca1.sroa.30.24.insert.ext, 16 + %alloca1.sroa.29.24.insert.ext = zext i8 %load175 to i64 + %alloca1.sroa.29.24.insert.shift = shl i64 %alloca1.sroa.29.24.insert.ext, 8 + %alloca1.sroa.29.24.insert.mask = or i64 %alloca1.sroa.31.24.insert.insert, %alloca1.sroa.30.24.insert.shift + %alloca1.sroa.27.24.insert.ext = zext i8 %load169 to i64 + %alloca1.sroa.27.24.insert.mask = or i64 %alloca1.sroa.29.24.insert.mask, %alloca1.sroa.29.24.insert.shift + %alloca1.sroa.27.24.insert.insert = or i64 %alloca1.sroa.27.24.insert.mask, %alloca1.sroa.27.24.insert.ext + %call235 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.27.24.insert.insert, i64 0, <16 x float> %call230, i32 0, i32 0, i32 0) + ret <16 x float> %call235 +} + +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0 + +attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg new file mode 100644 index 0000000000000..7c492428aec76 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AMDGPU" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll index 19cb7d21471d9..d821caa5b1c6b 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll @@ -151,8 +151,17 @@ define i16 @test_atomicrmw_and_i16_global_system(ptr addrspace(1) %ptr, i16 %val ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; @@ -204,8 +213,17 @@ define i16 @test_atomicrmw_or_i16_global_system(ptr 
addrspace(1) %ptr, i16 %valu ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; @@ -224,8 +242,17 @@ define i16 @test_atomicrmw_xor_i16_global_system(ptr addrspace(1) %ptr, i16 %val ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll index c49909597c72c..72fc4f468543a 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll @@ -140,33 +140,74 @@ define i16 @test_atomicrmw_sub_i16_global_agent(ptr addrspace(1) %ptr, i16 %valu } define i16 @test_atomicrmw_and_i16_global_agent(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 
[[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; R600-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; R600-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; R600-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent_align4(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 
[[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4 ret i16 %res @@ -174,21 +215,46 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4(ptr addrspace(1) %ptr, i1 ; Drop unknown metadata and noundef define i16 @test_atomicrmw_and_i16_global_agent_drop_md(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_drop_md( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_drop_md( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = 
xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_drop_md( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; R600-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; R600-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; R600-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !noundef !0, !some.unknown.md !0 ret i16 %res @@ -196,12 +262,28 @@ define i16 @test_atomicrmw_and_i16_global_agent_drop_md(ptr addrspace(1) %ptr, i ; Drop unknown metadata define i16 @test_atomicrmw_and_i16_global_agent_align4_drop_md(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_drop_md( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4_drop_md( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4_drop_md( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !noundef !0, !some.unknown.md !0 ret i16 %res @@ -209,21 +291,46 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4_drop_md(ptr addrspace(1) ; Drop noundef, preserve mmra define i16 @test_atomicrmw_and_i16_global_agent_preserve_mmra(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_preserve_mmra( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !mmra [[META0:![0-9]+]] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_preserve_mmra( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4, !mmra [[META0:![0-9]+]] +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !mmra [[META0]] +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_preserve_mmra( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; R600-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; R600-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; R600-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !mmra [[META0:![0-9]+]] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !noundef !0, !mmra !1 ret i16 %res @@ -231,126 +338,252 @@ define i16 @test_atomicrmw_and_i16_global_agent_preserve_mmra(ptr addrspace(1) % ; Drop noundef, preserve mmra define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_mmra(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_mmra( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !mmra [[META0]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_mmra( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4, !mmra [[META0]] +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !mmra [[META0]] +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_mmra( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !mmra [[META0]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 
+; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !noundef !0, !mmra !1 ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_alias_scope(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_alias_scope( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !alias.scope [[META1:![0-9]+]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_alias_scope( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_alias_scope( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !alias.scope [[META1:![0-9]+]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !alias.scope !2 ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_noalias(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_noalias( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !noalias [[META1]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_noalias( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP3]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_noalias( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !noalias [[META1]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !noalias !2 ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa_struct(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa_struct( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa_struct( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa_struct( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !tbaa.struct !5 ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !tbaa [[TBAA5:![0-9]+]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 
-; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !tbaa [[TBAA5:![0-9]+]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !tbaa !6 ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META8:![0-9]+]] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; GCN-NEXT: 
[[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1:![0-9]+]] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; R600-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; R600-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; R600-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META8:![0-9]+]] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META8]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META8]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0 ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_fine_grained_memory( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr 
addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META8]] -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_fine_grained_memory( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; GCN-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]] +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_fine_grained_memory( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; R600-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; R600-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; R600-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META8]] +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret i16 %res } define i16 @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_memory( -; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 -; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr 
addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META8]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_memory( +; GCN-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; GCN-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_memory( +; R600-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536 +; R600-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META8]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret i16 %res @@ -390,40 +623,88 @@ define i16 @test_atomicrmw_nand_i16_global_agent(ptr addrspace(1) %ptr, i16 %val } define i16 @test_atomicrmw_or_i16_global_agent(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_or_i16_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_or_i16_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, 
align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_or_i16_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; R600-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; R600-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; R600-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst ret i16 %res } define i16 @test_atomicrmw_xor_i16_global_agent(ptr addrspace(1) %ptr, i16 %value) { -; CHECK-LABEL: @test_atomicrmw_xor_i16_global_agent( -; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 -; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] -; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CHECK-NEXT: ret i16 [[EXTRACTED]] +; GCN-LABEL: @test_atomicrmw_xor_i16_global_agent( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 
[[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; GCN-NEXT: ret i16 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_xor_i16_global_agent( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; R600-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; R600-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; R600-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; R600-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; R600-NEXT: ret i16 [[EXTRACTED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst ret i16 %res @@ -1362,6 +1643,102 @@ define i16 @test_atomicrmw_add_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr ret i16 %res } +define i16 @test_atomicrmw_sub_i16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i16 %value) { +; CHECK-LABEL: @test_atomicrmw_sub_i16_global_agent__amdgpu_no_remote_memory( +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CHECK-NEXT: ret i16 [[EXTRACTED]] +; + %res = atomicrmw sub ptr 
addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret i16 %res +} + +define i16 @test_atomicrmw_sub_i16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i16 %value) { +; CHECK-LABEL: @test_atomicrmw_sub_i16_global_agent__amdgpu_no_fine_grained_memory( +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CHECK-NEXT: ret i16 [[EXTRACTED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i16 %res +} + +define i16 @test_atomicrmw_sub_i16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i16 %value) { +; CHECK-LABEL: @test_atomicrmw_sub_i16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr 
addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CHECK-NEXT: ret i16 [[EXTRACTED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret i16 %res +} + !0 = !{} !1 = !{!"foo", !"bar"} !2 = !{!3} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll index a1007bacd522f..1440045d11e2d 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll @@ -110,10 +110,99 @@ define i32 @test_atomicrmw_add_i32_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, sub is not supported over PCIe define i32 @test_atomicrmw_sub_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 
[[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; 
GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -155,10 +244,99 @@ define i32 @test_atomicrmw_sub_i32_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_and_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") 
seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_and_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, 
align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -285,10 +463,99 @@ define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory_ ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_or_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: 
[[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_or_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -330,10 +597,99 @@ define i32 @test_atomicrmw_or_i32_global_agent__amdgpu_no_fine_grained_memory__a ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_xor_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) 
[[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -375,10 +731,105 @@ define i32 @test_atomicrmw_xor_i32_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_max_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 
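+;
+; NOTE: a hand-written sketch (not produced by update_test_checks.py) of the
+; loop shape these checks describe: when the target cannot issue the RMW over
+; PCIe, AtomicExpand rewrites atomicrmw max into an icmp/select feeding a
+; cmpxchg retry loop. Function and value names below are illustrative only.
+;
+;   define i32 @sketch_expanded_max(ptr addrspace(1) %p, i32 %v) {
+;   entry:
+;     %init = load i32, ptr addrspace(1) %p, align 4
+;     br label %loop
+;   loop:
+;     %loaded = phi i32 [ %init, %entry ], [ %new.loaded, %loop ]
+;     %cmp = icmp sgt i32 %loaded, %v
+;     %new = select i1 %cmp, i32 %loaded, i32 %v
+;     %pair = cmpxchg ptr addrspace(1) %p, i32 %loaded, i32 %new syncscope("agent") seq_cst seq_cst, align 4
+;     %success = extractvalue { i32, i1 } %pair, 1
+;     %new.loaded = extractvalue { i32, i1 } %pair, 0
+;     br i1 %success, label %done, label %loop
+;   done:
+;     ret i32 %new.loaded
+;   }
+;
+; The min/umax/umin tests below differ only in the icmp predicate
+; (sle/ugt/ule), and uinc_wrap instead selects between 0 and loaded+1, as
+; the later checks show.
+;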
+; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX803-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX906-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX90A-NEXT: br i1 
[[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX10-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_max_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -420,10 +871,105 @@ define i32 @test_atomicrmw_max_i32_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_min_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; 
GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX803-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX906-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { 
i32, i1 } [[TMP3]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX10-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_min_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -465,10 +1011,105 @@ define i32 @test_atomicrmw_min_i32_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_umax_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = 
load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX803-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX906-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 
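+;
+; NOTE: for contrast, a hand-written sketch (illustrative names, not generated
+; checks) of the form the GFX940 and GFX12 lines just below expect: on those
+; targets no expansion is needed, presumably because they can perform the
+; operation over PCIe, so the atomicrmw survives as a single instruction.
+;
+;   define i32 @sketch_unexpanded_umax(ptr addrspace(1) %p, i32 %v) {
+;     %res = atomicrmw umax ptr addrspace(1) %p, i32 %v syncscope("agent") seq_cst, align 4
+;     ret i32 %res
+;   }
+;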
+; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX10-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -510,10 +1151,105 @@ define i32 @test_atomicrmw_umax_i32_global_agent__amdgpu_no_fine_grained_memory_ ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_umin_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 
[[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX803-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX906-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; 
GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX10-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -555,10 +1291,111 @@ define i32 @test_atomicrmw_umin_i32_global_agent__amdgpu_no_fine_grained_memory_ ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_uinc_wrap_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 
@test_atomicrmw_uinc_wrap_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; GFX803-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; GFX906-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; GFX908-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; 
GFX90A-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; GFX10-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; GFX11-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -600,10 +1437,123 @@ define i32 @test_atomicrmw_uinc_wrap_i32_global_agent__amdgpu_no_fine_grained_me ; expansion is necessary, operation not supported over PCIe define i32 @test_atomicrmw_udec_wrap_i32_global_agent(ptr 
addrspace(1) %ptr, i32 %value) { -; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 -; COMMON-NEXT: ret i32 [[NEWLOADED]] +; GFX803-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; GFX803-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; GFX803-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; GFX803-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i32 [[NEWLOADED]] +; +; GFX906-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; GFX906-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; GFX906-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; GFX906-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i32 [[NEWLOADED]] +; +; GFX908-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; GFX908-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; GFX908-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i32 [[NEWLOADED]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; GFX90A-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; GFX90A-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i32 [[NEWLOADED]] +; +; GFX940-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; GFX10-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; GFX10-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; GFX10-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i32 [[NEWLOADED]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; GFX11-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; GFX11-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX11-NEXT: [[NEW:%.*]] 
= select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i32 [[NEWLOADED]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst ret i32 %res @@ -657,12 +1607,3 @@ define i32 @test_atomicrmw_udec_wrap_i32_global_agent__amdgpu_no_fine_grained_me ;. ; GFX12: [[META0]] = !{} ;. -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX942: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll index 08288848efd66..b8e7c3eb4673a 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll @@ -112,7 +112,16 @@ define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory_ define i32 @test_atomicrmw_sub_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst @@ -157,7 +166,16 @@ define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory_ define i32 @test_atomicrmw_and_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; 
COMMON-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst @@ -287,7 +305,16 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory define i32 @test_atomicrmw_or_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst @@ -332,7 +359,16 @@ define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__ define i32 @test_atomicrmw_xor_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst @@ -397,17 +433,7 @@ define i32 @test_atomicrmw_max_i32_global_system(ptr addrspace(1) %ptr, i32 %val define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; 
COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -417,17 +443,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory( define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -437,17 +453,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory(ptr ad define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; 
COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -482,17 +488,7 @@ define i32 @test_atomicrmw_min_i32_global_system(ptr addrspace(1) %ptr, i32 %val define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -502,17 +498,7 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory( define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -522,17 +508,7 @@ define i32 
@test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory(ptr ad define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -567,17 +543,7 @@ define i32 @test_atomicrmw_umax_i32_global_system(ptr addrspace(1) %ptr, i32 %va define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -587,17 +553,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load 
i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -607,17 +563,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory(ptr a define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -652,17 +598,7 @@ define i32 @test_atomicrmw_umin_i32_global_system(ptr addrspace(1) %ptr, i32 %va define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] 
= cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -672,17 +608,7 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -692,17 +618,7 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory(ptr a define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] 
seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -717,7 +633,18 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory define i32 @test_atomicrmw_uinc_wrap_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst @@ -762,7 +689,20 @@ define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_m define i32 @test_atomicrmw_udec_wrap_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll index 7586a0af43c95..8bc481408fe73 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll @@ -110,10 +110,99 @@ define i64 
@test_atomicrmw_add_i64_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, sub is not supported over PCIe define i64 @test_atomicrmw_sub_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = 
phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -155,10 +244,99 @@ define i64 @test_atomicrmw_sub_i64_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_and_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 
8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], 
label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_and_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -285,10 +463,99 @@ define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory_ ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_or_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = or 
i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) 
[[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_or_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -330,10 +597,99 @@ define i64 @test_atomicrmw_or_i64_global_agent__amdgpu_no_fine_grained_memory__a ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_xor_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 
@test_atomicrmw_xor_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") 
seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -375,10 +731,105 @@ define i64 @test_atomicrmw_xor_i64_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_max_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX803-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; 
GFX906-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX906-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX10-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_max_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -420,10 +871,105 @@ define i64 @test_atomicrmw_max_i64_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_min_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX803-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX906-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX10-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") 
seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_min_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -465,10 +1011,105 @@ define i64 @test_atomicrmw_min_i64_global_agent__amdgpu_no_fine_grained_memory__ ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_umax_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX803-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: 
[[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX906-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX10-NEXT: [[TMP3:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -510,10 +1151,105 @@ define i64 @test_atomicrmw_umax_i64_global_agent__amdgpu_no_fine_grained_memory_ ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_umin_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX803-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX906-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 
[[LOADED]], i64 [[VALUE]] +; GFX10-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -555,10 +1291,111 @@ define i64 @test_atomicrmw_umin_i64_global_agent__amdgpu_no_fine_grained_memory_ ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_uinc_wrap_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; GFX803-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], 
i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; GFX906-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; GFX908-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; GFX90A-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; GFX10-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; GFX11-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -600,10 +1437,123 @@ define i64 @test_atomicrmw_uinc_wrap_i64_global_agent__amdgpu_no_fine_grained_me ; expansion is necessary, operation not supported over PCIe define i64 @test_atomicrmw_udec_wrap_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) { -; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 -; COMMON-NEXT: ret i64 [[NEWLOADED]] +; GFX803-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; GFX803-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; GFX803-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX803-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; GFX803-NEXT: [[TMP6:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GFX803-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret i64 [[NEWLOADED]] +; +; GFX906-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; GFX906-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; GFX906-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX906-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; GFX906-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GFX906-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; GFX908-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; GFX908-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX908-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; GFX90A-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; GFX90A-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX90A-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst 
seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX10-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; GFX10-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; GFX10-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX10-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; GFX10-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GFX10-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret i64 [[NEWLOADED]] +; +; GFX11-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; GFX11-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; GFX11-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; GFX11-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret i64 [[NEWLOADED]] +; +; GFX12-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8 +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst ret i64 %res @@ -657,12 +1607,3 @@ define i64 @test_atomicrmw_udec_wrap_i64_global_agent__amdgpu_no_fine_grained_me ;. ; GFX12: [[META0]] = !{} ;. -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX12: {{.*}}
-; GFX803: {{.*}}
-; GFX906: {{.*}}
-; GFX908: {{.*}}
-; GFX90A: {{.*}}
-; GFX942: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
index 4f3979f25076e..c895eebcf0f8d 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
@@ -112,7 +112,16 @@ define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory_
define i64 @test_atomicrmw_sub_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
; COMMON-NEXT: ret i64 [[NEWLOADED]]
;
%res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst
@@ -157,7 +166,16 @@ define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory_
define i64 @test_atomicrmw_and_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
; COMMON-NEXT: ret i64 [[NEWLOADED]]
;
%res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst
@@ -287,7 +305,16 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory
define i64 @test_atomicrmw_or_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
; COMMON-NEXT: ret i64 [[NEWLOADED]]
;
%res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst
@@ -332,7 +359,16 @@ define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__
define i64 @test_atomicrmw_xor_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
; COMMON-NEXT: ret i64 [[NEWLOADED]]
;
%res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst
@@ -397,17 +433,7 @@ define i64 @test_atomicrmw_max_i64_global_system(ptr addrspace(1) %ptr, i64 %val
define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
-; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
; COMMON-NEXT: ret i64 [[NEWLOADED]]
;
%res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -417,17 +443,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory(
define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
-; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: ret i64 [[NEWLOADED]]
;
%res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
@@ -437,17 +453,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory(ptr ad
define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
-; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
-; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; COMMON-NEXT: ret i64 [[NEWLOADED]]
;
%res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -482,17 +488,7 @@ define i64 @test_atomicrmw_min_i64_global_system(ptr addrspace(1) %ptr, i64 %val
define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory(
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON:
atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -502,17 +498,7 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory( define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -522,17 +508,7 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory(ptr ad define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, 
i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -567,17 +543,7 @@ define i64 @test_atomicrmw_umax_i64_global_system(ptr addrspace(1) %ptr, i64 %va define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -587,17 +553,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -607,17 
+563,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory(ptr a define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -652,17 +598,7 @@ define i64 @test_atomicrmw_umin_i64_global_system(ptr addrspace(1) %ptr, i64 %va define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -672,17 +608,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; 
COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -692,17 +618,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory(ptr a define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -717,7 +633,18 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory define i64 @test_atomicrmw_uinc_wrap_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] 
= icmp uge i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst @@ -762,7 +689,20 @@ define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_m define i64 @test_atomicrmw_udec_wrap_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll index 088371f461ec1..b548943a326b8 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll @@ -162,8 +162,17 @@ define i8 @test_atomicrmw_and_i8_global_system(ptr addrspace(1) %ptr, i8 %value) ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 
[[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; @@ -215,8 +224,17 @@ define i8 @test_atomicrmw_or_i8_global_system(ptr addrspace(1) %ptr, i8 %value) ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; @@ -235,8 +253,17 @@ define i8 @test_atomicrmw_xor_i8_global_system(ptr addrspace(1) %ptr, i8 %value) ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll index 590ee63001615..b19a717b56938 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll @@ -265,8 +265,17 @@ define i8 @test_atomicrmw_and_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) ; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; GCN-NEXT: [[ANDOPERAND:%.*]] 
= or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; GCN-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4 -; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; GCN-NEXT: ret i8 [[EXTRACTED]] ; @@ -360,8 +369,17 @@ define i8 @test_atomicrmw_or_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; GCN-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 -; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; GCN-NEXT: ret i8 [[EXTRACTED]] ; @@ -394,8 +412,17 @@ define i8 @test_atomicrmw_xor_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) ; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; GCN-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4 -; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } 
[[TMP5]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; GCN-NEXT: ret i8 [[EXTRACTED]] ; @@ -1712,3 +1739,179 @@ define i8 @test_atomicrmw_add_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr, %res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4 ret i8 %res } + +define i8 @test_atomicrmw_sub_i8_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i8 %value) { +; GCN-LABEL: @test_atomicrmw_sub_i8_global_agent__amdgpu_no_remote_memory( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; GCN-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_sub_i8_global_agent__amdgpu_no_remote_memory( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; R600-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, 
align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret i8 %res +} + +define i8 @test_atomicrmw_sub_i8_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i8 %value) { +; GCN-LABEL: @test_atomicrmw_sub_i8_global_agent__amdgpu_no_fine_grained_memory( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; GCN-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_sub_i8_global_agent__amdgpu_no_fine_grained_memory( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; R600-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) 
[[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i8 %res +} + +define i8 @test_atomicrmw_sub_i8_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i8 %value) { +; GCN-LABEL: @test_atomicrmw_sub_i8_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]] +; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; GCN-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; GCN-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; GCN-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; GCN-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; GCN-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end: +; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; GCN-NEXT: ret i8 [[EXTRACTED]] +; +; R600-LABEL: @test_atomicrmw_sub_i8_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4) +; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32 +; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; R600-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; R600-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; R600-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; R600-NEXT: br label [[ATOMICRMW_START:%.*]] +; R600: atomicrmw.start: +; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; R600-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; R600-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; R600-NEXT: 
[[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; R600-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; R600: atomicrmw.end: +; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; R600-NEXT: ret i8 [[EXTRACTED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret i8 %res +} + +!0 = !{} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll index 1bf821fd53eab..f8eb16f61e3b7 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll @@ -332,10 +332,71 @@ define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5(ptr %ptr, i64 } define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { -; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( -; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] -; ALL-NEXT: ret i64 [[RES]] +; GFX7-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX7-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret i64 [[NEWLOADED]] +; +; GFX900-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX900-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label 
%[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX12-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 ret i64 %res @@ -424,10 +485,71 @@ define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5(ptr %ptr, i64 } define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { -; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained( -; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] -; ALL-NEXT: ret i64 [[RES]] +; GFX7-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; GFX7-NEXT: br label 
%[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX7-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret i64 [[NEWLOADED]] +; +; GFX900-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX900-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret i64 [[NEWLOADED]] +; +; GFX908-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret i64 [[NEWLOADED]] +; +; GFX90A-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret i64 [[NEWLOADED]] +; +; GFX940-LABEL: define i64 
@test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX940-NEXT: ret i64 [[RES]] +; +; GFX12-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] +; GFX12-NEXT: ret i64 [[RES]] ; %res = atomicrmw sub ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 ret i64 %res diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll index dc1107d9130d5..185f42ebc028c 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll @@ -69,10 +69,40 @@ define i32 @test_atomicrmw_or_0_as999_system(ptr addrspace(999) %ptr) { ; Leave as-is, only system scope should be changed. define i32 @test_atomicrmw_or_0_global_agent(ptr addrspace(1) %ptr) { -; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_agent( -; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4 -; CHECK-NEXT: ret i32 [[RES]] +; GFX803-LABEL: define i32 @test_atomicrmw_or_0_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4 +; GFX803-NEXT: ret i32 [[RES]] +; +; GFX900-LABEL: define i32 @test_atomicrmw_or_0_global_agent( +; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4 +; GFX900-NEXT: ret i32 [[RES]] +; +; GFX90A-LABEL: define i32 @test_atomicrmw_or_0_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret i32 [[RES]] +; +; GFX10-LABEL: define i32 @test_atomicrmw_or_0_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4 +; GFX10-NEXT: ret i32 [[RES]] +; +; GFX11-LABEL: define i32 @test_atomicrmw_or_0_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4 +; GFX11-NEXT: ret i32 [[RES]] +; +; GFX942-LABEL: define i32 @test_atomicrmw_or_0_global_agent( +; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX942-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4 +; GFX942-NEXT: ret i32 [[RES]] +; +; GFX12-LABEL: define i32 @test_atomicrmw_or_0_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 
syncscope("agent") seq_cst ret i32 %res @@ -93,7 +123,16 @@ define i32 @test_atomicrmw_or_0_local(ptr addrspace(3) %ptr) { define i32 @test_atomicrmw_or_1_global_system(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_1_global_system( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 1 seq_cst, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 1 seq_cst @@ -103,7 +142,16 @@ define i32 @test_atomicrmw_or_1_global_system(ptr addrspace(1) %ptr) { define i32 @test_atomicrmw_or_var_global_system(ptr addrspace(1) %ptr, i32 %val) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_var_global_system( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VAL]] seq_cst, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VAL]] +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 %val seq_cst @@ -146,7 +194,7 @@ define i32 @test_atomicrmw_xor_0_global_system(ptr addrspace(1) %ptr) { define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; CHECK-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -156,7 +204,7 @@ define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory(ptr define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_remote_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; CHECK-NEXT: 
[[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.remote.memory !0 @@ -166,7 +214,7 @@ define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_remote_memory(ptr addrs define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; CHECK-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -176,7 +224,7 @@ define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory__am define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; CHECK-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -186,7 +234,7 @@ define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory(pt define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_remote_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; CHECK-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.remote.memory !0 @@ -196,7 +244,7 @@ define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_remote_memory(ptr addr define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; CHECK-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -206,7 +254,7 @@ define i32 
@test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory__a define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; CHECK-NEXT: [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -216,7 +264,7 @@ define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory(pt define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_remote_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; CHECK-NEXT: [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.remote.memory !0 @@ -226,7 +274,7 @@ define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_remote_memory(ptr addr define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; CHECK-NEXT: [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -324,12 +372,3 @@ define i32 @test_atomicrmw_sub_0_global_agent__amdgpu_no_fine_grained_memory__am } !0 = !{} - -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX900: {{.*}} -; GFX90A: {{.*}} -; GFX942: {{.*}} diff --git a/llvm/test/Transforms/Inline/AMDGPU/load-intrinsics.ll b/llvm/test/Transforms/Inline/AMDGPU/load-intrinsics.ll new file mode 100644 index 0000000000000..46f53d8f82cfd --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/load-intrinsics.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -mtriple=amdgcn --passes=inline --enable-noalias-to-md-conversion -S %s | FileCheck --check-prefix=OPT %s + +; This test checks that the load intrinsic gets the correct memory(argmem: read) attribute and +; that the call instruction is assigned the correct !alias.scope metadata post-inlining + +define void @caller(ptr addrspace(3) %addr_f, ptr addrspace(1) %use_f) { +; OPT-LABEL: define void @caller( +; OPT-SAME: ptr addrspace(3) [[ADDR_F:%.*]], ptr addrspace(1) [[USE_F:%.*]]) { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) +; OPT-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +; OPT-NEXT: [[GEP_I:%.*]] = getelementptr i64, ptr addrspace(3) [[ADDR_F]], i32 4 +; OPT-NEXT: [[VAL_I:%.*]] = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[GEP_I]]), !alias.scope [[META0]], !noalias [[META3]] +; OPT-NEXT: store <2 x i32> [[VAL_I]], ptr addrspace(1) [[USE_F]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; OPT-NEXT: ret void +; +entry: + call void @callee(ptr addrspace(3) %addr_f, ptr addrspace(1) %use_f) + ret void +} + +define void @callee(ptr addrspace(3) noalias %addr, ptr addrspace(1) noalias %use) { +; OPT-LABEL: define void @callee( +; OPT-SAME: ptr addrspace(3) noalias [[ADDR:%.*]], ptr addrspace(1) noalias [[USE:%.*]]) { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[GEP:%.*]] = getelementptr i64, ptr addrspace(3) [[ADDR]], i32 4 +; OPT-NEXT: [[VAL:%.*]] = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[GEP]]) +; OPT-NEXT: store <2 x i32> [[VAL]], ptr addrspace(1) [[USE]], align 8 +; OPT-NEXT: ret void +; +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} +;. +; Check the function attribute on the declaration +; OPT: declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) captures(none)) #[[ATTR0:[0-9]+]] +declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3)) +; OPT: attributes #[[ATTR0]] = { convergent nocallback nofree nounwind willreturn memory(argmem: read) } +; OPT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +;. +; OPT: [[META0]] = !{[[META1:![0-9]+]]} +; OPT: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"callee: %addr"} +; OPT: [[META2]] = distinct !{[[META2]], !"callee"} +; OPT: [[META3]] = !{[[META4:![0-9]+]]} +; OPT: [[META4]] = distinct !{[[META4]], [[META2]], !"callee: %use"} +;.
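The noalias-to-metadata conversion exercised above is easiest to see on a plain scalar example. Below is a minimal, target-independent sketch, not part of this patch, assuming the same inline pipeline; the function names are hypothetical. After inlining, the callee's noalias arguments become llvm.experimental.noalias.scope.decl calls, and the inlined load/store carry matching !alias.scope/!noalias metadata, exactly as the AMDGPU intrinsic does in the test above.

; RUN: opt --passes=inline --enable-noalias-to-md-conversion -S %s | FileCheck %s

; Callee with two noalias pointer arguments; small enough to always inline.
define void @callee.scalar(ptr noalias %dst, ptr noalias %src) {
  %v = load i32, ptr %src
  store i32 %v, ptr %dst
  ret void
}

define void @caller.scalar(ptr %a, ptr %b) {
; CHECK-LABEL: @caller.scalar(
; The inliner emits one scope declaration per noalias argument, then
; annotates the inlined memory accesses with the corresponding scopes.
; CHECK: call void @llvm.experimental.noalias.scope.decl(
; CHECK: load i32, ptr %b{{.*}}!alias.scope
; CHECK: store i32 %{{.*}}!noalias
  call void @callee.scalar(ptr %a, ptr %b)
  ret void
}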
diff --git a/llvm/test/Transforms/InstCombine/or-packed-int-vecs.ll b/llvm/test/Transforms/InstCombine/or-packed-int-vecs.ll new file mode 100644 index 0000000000000..9391fb5ddae97 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/or-packed-int-vecs.ll @@ -0,0 +1,926 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt %s -passes=instcombine -data-layout="E" -S | FileCheck %s --check-prefixes=CHECK,CHECK-BE +; RUN: opt %s -passes=instcombine -data-layout="e" -S | FileCheck %s --check-prefixes=CHECK,CHECK-LE + +define i32 @bitcast.v2i.le(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @bitcast.v2i.le( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <4 x i8> [[V]], i64 2 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i32 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @bitcast.v2i.le( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[V]] to i32 +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <4 x i8> %v, i64 0 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 1 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <4 x i8> %v, i64 2 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %x.1, %s.2 + + %v.3 = extractelement <4 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i32 @bitcast.v2i.be(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @bitcast.v2i.be( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[V]] to i32 +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @bitcast.v2i.be( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 2 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i32 [[X_1]], [[S_2]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <4 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 
to i32 + + %v.1 = extractelement <4 x i8> %v, i64 2 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <4 x i8> %v, i64 1 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %x.1, %s.2 + + %v.3 = extractelement <4 x i8> %v, i64 0 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i64 @bitcast.v2i.le.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @bitcast.v2i.le.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <4 x i16> [[V]], i64 2 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @bitcast.v2i.le.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[V]] to i64 +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <4 x i16> %v, i64 0 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 1 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <4 x i16> %v, i64 2 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <4 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i64 @bitcast.v2i.be.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @bitcast.v2i.be.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[V]] to i64 +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @bitcast.v2i.be.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 2 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <4 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 2 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 
16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <4 x i16> %v, i64 1 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <4 x i16> %v, i64 0 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i32 @bitcast.v2i.le.tree(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @bitcast.v2i.le.tree( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <4 x i8> [[V]], i64 2 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[S_2]], [[S_3]] +; CHECK-BE-NEXT: [[X:%.*]] = or disjoint i32 [[X_1]], [[X_3]] +; CHECK-BE-NEXT: ret i32 [[X]] +; +; CHECK-LE-LABEL: define i32 @bitcast.v2i.le.tree( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X:%.*]] = bitcast <4 x i8> [[V]] to i32 +; CHECK-LE-NEXT: ret i32 [[X]] +; + %v.0 = extractelement <4 x i8> %v, i64 0 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 1 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <4 x i8> %v, i64 2 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + + %v.3 = extractelement <4 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %s.2, %s.3 + + %x = or i32 %x.1, %x.3 + + ret i32 %x +} + +define i32 @bitcast.v2i.be.tree(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @bitcast.v2i.be.tree( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X:%.*]] = bitcast <4 x i8> [[V]] to i32 +; CHECK-BE-NEXT: ret i32 [[X]] +; +; CHECK-LE-LABEL: define i32 @bitcast.v2i.be.tree( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 2 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[S_2]], [[S_3]] +; CHECK-LE-NEXT: [[X:%.*]] = or disjoint i32 [[X_1]], [[X_3]] +; CHECK-LE-NEXT: ret i32 [[X]] +; + %v.0 = extractelement <4 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 2 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <4 x i8> %v, i64 1 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + + %v.3 = 
extractelement <4 x i8> %v, i64 0 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %s.2, %s.3 + + %x = or i32 %x.1, %x.3 + + ret i32 %x +} + +define i64 @bitcast.v2i.le.tree.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @bitcast.v2i.le.tree.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <4 x i16> [[V]], i64 2 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[S_2]], [[S_3]] +; CHECK-BE-NEXT: [[X:%.*]] = or disjoint i64 [[X_1]], [[X_3]] +; CHECK-BE-NEXT: ret i64 [[X]] +; +; CHECK-LE-LABEL: define i64 @bitcast.v2i.le.tree.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X:%.*]] = bitcast <4 x i16> [[V]] to i64 +; CHECK-LE-NEXT: ret i64 [[X]] +; + %v.0 = extractelement <4 x i16> %v, i64 0 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 1 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <4 x i16> %v, i64 2 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + + %v.3 = extractelement <4 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %s.2, %s.3 + + %x = or i64 %x.1, %x.3 + + ret i64 %x +} + +define i64 @bitcast.v2i.be.tree.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @bitcast.v2i.be.tree.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X:%.*]] = bitcast <4 x i16> [[V]] to i64 +; CHECK-BE-NEXT: ret i64 [[X]] +; +; CHECK-LE-LABEL: define i64 @bitcast.v2i.be.tree.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 2 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[S_2]], [[S_3]] +; CHECK-LE-NEXT: [[X:%.*]] = or disjoint i64 [[X_1]], [[X_3]] +; CHECK-LE-NEXT: ret i64 [[X]] +; + %v.0 = extractelement <4 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 2 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <4 x i16> %v, i64 1 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + + %v.3 = extractelement <4 x i16> %v, i64 0 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + 
%x.3 = or i64 %s.2, %s.3 + + %x = or i64 %x.1, %x.3 + + ret i64 %x +} + +define i32 @extract.le.i32(<8 x i8> %v) { +; CHECK-BE-LABEL: define i32 @extract.le.i32( +; CHECK-BE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <8 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <8 x i8> [[V]], i64 4 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <8 x i8> [[V]], i64 5 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i32 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <8 x i8> [[V]], i64 6 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @extract.le.i32( +; CHECK-LE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> poison, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-LE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i8> [[X_3_V_EXTRACT]] to <2 x i32> +; CHECK-LE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i32> [[X_3_V_BC]], i64 0 +; CHECK-LE-NEXT: ret i32 [[X_3_V_EXTRACT1]] +; + %v.0 = extractelement <8 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <8 x i8> %v, i64 4 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <8 x i8> %v, i64 5 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %x.1, %s.2 + + %v.3 = extractelement <8 x i8> %v, i64 6 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i32 @extract.be.i32(<8 x i8> %v) { +; CHECK-BE-LABEL: define i32 @extract.be.i32( +; CHECK-BE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> poison, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-BE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i8> [[X_3_V_EXTRACT]] to <2 x i32> +; CHECK-BE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i32> [[X_3_V_BC]], i64 0 +; CHECK-BE-NEXT: ret i32 [[X_3_V_EXTRACT1]] +; +; CHECK-LE-LABEL: define i32 @extract.be.i32( +; CHECK-LE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <8 x i8> [[V]], i64 6 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <8 x i8> [[V]], i64 5 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <8 x i8> [[V]], i64 4 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i32 [[X_1]], [[S_2]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <8 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <8 x i8> %v, i64 6 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <8 x i8> %v, i64 5 + %z.1 =
zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.2 = extractelement <8 x i8> %v, i64 4 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %x.1, %s.2 + + %v.3 = extractelement <8 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i64 @extract.le.i64(<8 x i16> %v) { +; CHECK-BE-LABEL: define i64 @extract.le.i64( +; CHECK-BE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <8 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <8 x i16> [[V]], i64 4 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <8 x i16> [[V]], i64 5 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <8 x i16> [[V]], i64 6 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @extract.le.i64( +; CHECK-LE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i16> [[V]], <8 x i16> poison, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-LE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i16> [[X_3_V_EXTRACT]] to <2 x i64> +; CHECK-LE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i64> [[X_3_V_BC]], i64 0 +; CHECK-LE-NEXT: ret i64 [[X_3_V_EXTRACT1]] +; + %v.0 = extractelement <8 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <8 x i16> %v, i64 4 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <8 x i16> %v, i64 5 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <8 x i16> %v, i64 6 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i64 @extract.be.i64(<8 x i16> %v) { +; CHECK-BE-LABEL: define i64 @extract.be.i64( +; CHECK-BE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i16> [[V]], <8 x i16> poison, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison> +; CHECK-BE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i16> [[X_3_V_EXTRACT]] to <2 x i64> +; CHECK-BE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i64> [[X_3_V_BC]], i64 0 +; CHECK-BE-NEXT: ret i64 [[X_3_V_EXTRACT1]] +; +; CHECK-LE-LABEL: define i64 @extract.be.i64( +; CHECK-LE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <8 x i16> [[V]], i64 6 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <8 x i16> [[V]], i64 5 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <8 x i16> [[V]], i64 4 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <8 x i16> [[V]], i64 3 +; CHECK-LE-NEXT:
[[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <8 x i16> %v, i64 6 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <8 x i16> %v, i64 5 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <8 x i16> %v, i64 4 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <8 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i32 @partial.le(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @partial.le( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_1]], [[S_3]] +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @partial.le( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V1:%.*]] = insertelement <4 x i8> [[V]], i8 0, i64 2 +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[X_3_V1]] to i32 +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <4 x i8> %v, i64 0 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 1 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.3 = extractelement <4 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.1, %s.3 + + ret i32 %x.3 +} + +define i32 @partial.be(<4 x i8> %v) { +; CHECK-BE-LABEL: define i32 @partial.be( +; CHECK-BE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V1:%.*]] = insertelement <4 x i8> [[V]], i8 0, i64 2 +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[X_3_V1]] to i32 +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @partial.be( +; CHECK-LE-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i8> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 16 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i8> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_1]], [[S_3]] +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <4 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <4 x i8> %v, i64 1 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 16 + %x.1 = or i32 %z.0, %s.1 + + %v.3 = extractelement <4 x i8> %v, i64 0 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.1, %s.3 + + ret i32 %x.3 +} + + +define i64 @partial.le.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @partial.le.i16( +; 
CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_1]], [[S_3]] +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @partial.le.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V1:%.*]] = insertelement <4 x i16> [[V]], i16 0, i64 2 +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[X_3_V1]] to i64 +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <4 x i16> %v, i64 0 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 1 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.3 = extractelement <4 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.1, %s.3 + + ret i64 %x.3 +} + +define i64 @partial.be.i16(<4 x i16> %v) { +; CHECK-BE-LABEL: define i64 @partial.be.i16( +; CHECK-BE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V1:%.*]] = insertelement <4 x i16> [[V]], i16 0, i64 2 +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[X_3_V1]] to i64 +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @partial.be.i16( +; CHECK-LE-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <4 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_1:%.*]] = extractelement <4 x i16> [[V]], i64 1 +; CHECK-LE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-LE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 32 +; CHECK-LE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <4 x i16> [[V]], i64 0 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_1]], [[S_3]] +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <4 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <4 x i16> %v, i64 1 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 32 + %x.1 = or i64 %z.0, %s.1 + + %v.3 = extractelement <4 x i16> %v, i64 0 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.1, %s.3 + + ret i64 %x.3 +} + +define i32 @partial.extract.le.i32(<8 x i8> %v) { +; CHECK-BE-LABEL: define i32 @partial.extract.le.i32( +; CHECK-BE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <8 x i8> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <8 x i8> [[V]], i64 4 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i8 [[V_1]] to i32 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i32 [[Z_1]], 8 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i32 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <8 x i8> [[V]], i64 6 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_1]], [[S_3]] +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; 
CHECK-LE-LABEL: define i32 @partial.extract.le.i32( +; CHECK-LE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> , <4 x i32> +; CHECK-LE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[X_3_V]] to i32 +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <8 x i8> %v, i64 3 + %z.0 = zext i8 %v.0 to i32 + + %v.1 = extractelement <8 x i8> %v, i64 4 + %z.1 = zext i8 %v.1 to i32 + %s.1 = shl i32 %z.1, 8 + %x.1 = or i32 %z.0, %s.1 + + %v.3 = extractelement <8 x i8> %v, i64 6 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.1, %s.3 + + ret i32 %x.3 +} + +define i32 @partial.extract.be.i32(<8 x i8> %v) { +; CHECK-BE-LABEL: define i32 @partial.extract.be.i32( +; CHECK-BE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> , <4 x i32> +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i8> [[X_3_V]] to i32 +; CHECK-BE-NEXT: ret i32 [[X_3]] +; +; CHECK-LE-LABEL: define i32 @partial.extract.be.i32( +; CHECK-LE-SAME: <8 x i8> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <8 x i8> [[V]], i64 6 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i8 [[V_0]] to i32 +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <8 x i8> [[V]], i64 4 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i8 [[V_2]] to i32 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i32 [[Z_2]], 16 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i32 [[S_2]], [[Z_0]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <8 x i8> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i8 [[V_3]] to i32 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i32 [[Z_3]], 24 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i32 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i32 [[X_3]] +; + %v.0 = extractelement <8 x i8> %v, i64 6 + %z.0 = zext i8 %v.0 to i32 + + %v.2 = extractelement <8 x i8> %v, i64 4 + %z.2 = zext i8 %v.2 to i32 + %s.2 = shl i32 %z.2, 16 + %x.2 = or i32 %z.0, %s.2 + + %v.3 = extractelement <8 x i8> %v, i64 3 + %z.3 = zext i8 %v.3 to i32 + %s.3 = shl i32 %z.3, 24 + %x.3 = or i32 %x.2, %s.3 + + ret i32 %x.3 +} + +define i64 @partial.extract.le.i64(<8 x i16> %v) { +; CHECK-BE-LABEL: define i64 @partial.extract.le.i64( +; CHECK-BE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[V_0:%.*]] = extractelement <8 x i16> [[V]], i64 3 +; CHECK-BE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-BE-NEXT: [[V_1:%.*]] = extractelement <8 x i16> [[V]], i64 4 +; CHECK-BE-NEXT: [[Z_1:%.*]] = zext i16 [[V_1]] to i64 +; CHECK-BE-NEXT: [[S_1:%.*]] = shl nuw nsw i64 [[Z_1]], 16 +; CHECK-BE-NEXT: [[X_1:%.*]] = or disjoint i64 [[S_1]], [[Z_0]] +; CHECK-BE-NEXT: [[V_2:%.*]] = extractelement <8 x i16> [[V]], i64 5 +; CHECK-BE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-BE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-BE-NEXT: [[X_2:%.*]] = or disjoint i64 [[X_1]], [[S_2]] +; CHECK-BE-NEXT: [[V_3:%.*]] = extractelement <8 x i16> [[V]], i64 6 +; CHECK-BE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-BE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-BE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @partial.extract.le.i64( +; CHECK-LE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[X_3_V_EXTRACT:%.*]] = shufflevector <8 x i16> [[V]], <8 x i16> poison, <8 x i32> +; CHECK-LE-NEXT: [[X_3_V_BC:%.*]] = bitcast <8 x i16> [[X_3_V_EXTRACT]] to <2 x i64> +; CHECK-LE-NEXT: [[X_3_V_EXTRACT1:%.*]] = extractelement <2 x i64> [[X_3_V_BC]], i64 0 +; CHECK-LE-NEXT: ret i64 [[X_3_V_EXTRACT1]] +; + %v.0 = 
extractelement <8 x i16> %v, i64 3 + %z.0 = zext i16 %v.0 to i64 + + %v.1 = extractelement <8 x i16> %v, i64 4 + %z.1 = zext i16 %v.1 to i64 + %s.1 = shl i64 %z.1, 16 + %x.1 = or i64 %z.0, %s.1 + + %v.2 = extractelement <8 x i16> %v, i64 5 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %x.1, %s.2 + + %v.3 = extractelement <8 x i16> %v, i64 6 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define i64 @partial.extract.be.i64(<8 x i16> %v) { +; CHECK-BE-LABEL: define i64 @partial.extract.be.i64( +; CHECK-BE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-BE-NEXT: [[X_3_V:%.*]] = shufflevector <8 x i16> [[V]], <8 x i16> , <4 x i32> +; CHECK-BE-NEXT: [[X_3:%.*]] = bitcast <4 x i16> [[X_3_V]] to i64 +; CHECK-BE-NEXT: ret i64 [[X_3]] +; +; CHECK-LE-LABEL: define i64 @partial.extract.be.i64( +; CHECK-LE-SAME: <8 x i16> [[V:%.*]]) { +; CHECK-LE-NEXT: [[V_0:%.*]] = extractelement <8 x i16> [[V]], i64 6 +; CHECK-LE-NEXT: [[Z_0:%.*]] = zext i16 [[V_0]] to i64 +; CHECK-LE-NEXT: [[V_2:%.*]] = extractelement <8 x i16> [[V]], i64 4 +; CHECK-LE-NEXT: [[Z_2:%.*]] = zext i16 [[V_2]] to i64 +; CHECK-LE-NEXT: [[S_2:%.*]] = shl nuw nsw i64 [[Z_2]], 32 +; CHECK-LE-NEXT: [[X_2:%.*]] = or disjoint i64 [[S_2]], [[Z_0]] +; CHECK-LE-NEXT: [[V_3:%.*]] = extractelement <8 x i16> [[V]], i64 3 +; CHECK-LE-NEXT: [[Z_3:%.*]] = zext i16 [[V_3]] to i64 +; CHECK-LE-NEXT: [[S_3:%.*]] = shl nuw i64 [[Z_3]], 48 +; CHECK-LE-NEXT: [[X_3:%.*]] = or disjoint i64 [[X_2]], [[S_3]] +; CHECK-LE-NEXT: ret i64 [[X_3]] +; + %v.0 = extractelement <8 x i16> %v, i64 6 + %z.0 = zext i16 %v.0 to i64 + + %v.2 = extractelement <8 x i16> %v, i64 4 + %z.2 = zext i16 %v.2 to i64 + %s.2 = shl i64 %z.2, 32 + %x.2 = or i64 %z.0, %s.2 + + %v.3 = extractelement <8 x i16> %v, i64 3 + %z.3 = zext i16 %v.3 to i64 + %s.3 = shl i64 %z.3, 48 + %x.3 = or i64 %x.2, %s.3 + + ret i64 %x.3 +} + +define <2 x i16> @shufflecast.v2v(<4 x i8> %v) { +; CHECK-LABEL: define <2 x i16> @shufflecast.v2v( +; CHECK-SAME: <4 x i8> [[V:%.*]]) { +; CHECK-NEXT: [[W_3:%.*]] = bitcast <4 x i8> [[V]] to <2 x i16> +; CHECK-NEXT: ret <2 x i16> [[W_3]] +; + %v.0 = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + %c.0 = bitcast <4 x i8> %v.0 to <2 x i16> + + %v.1 = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %c.1 = bitcast <4 x i8> %v.1 to <2 x i16> + %w.1 = or <2 x i16> %c.0, %c.1 + + %v.2 = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 7> + %c.2 = bitcast <4 x i8> %v.2 to <2 x i16> + %w.2 = or <2 x i16> %w.1, %c.2 + + %v.3 = shufflevector <4 x i8> %v, <4 x i8> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 3> + %c.3 = bitcast <4 x i8> %v.3 to <2 x i16> + %w.3 = or <2 x i16> %w.2, %c.3 + + ret <2 x i16> %w.3 +} + +define <2 x i32> @shufflecast.v2v.i16(<4 x i16> %v) { +; CHECK-LABEL: define <2 x i32> @shufflecast.v2v.i16( +; CHECK-SAME: <4 x i16> [[V:%.*]]) { +; CHECK-NEXT: [[W_3:%.*]] = bitcast <4 x i16> [[V]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[W_3]] +; + %v.0 = shufflevector <4 x i16> %v, <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + %c.0 = bitcast <4 x i16> %v.0 to <2 x i32> + + %v.1 = shufflevector <4 x i16> %v, <4 x i16> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %c.1 = bitcast <4 x i16> %v.1 to <2 x i32> + %w.1 = or <2 x i32> %c.0, %c.1 + + %v.2 = shufflevector <4 x i16> %v, <4 x i16> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 7> + %c.2 = bitcast <4 x i16> %v.2 to <2 x i32> + %w.2 = or <2 x i32> %w.1, %c.2 + + %v.3 = shufflevector <4 x i16> %v, <4 x i16> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 3> + %c.3 = bitcast <4 x i16> %v.3
to <2 x i32> + %w.3 = or <2 x i32> %w.2, %c.3 + + ret <2 x i32> %w.3 +} + +define i32 @bitcast.v2i.half(<2 x half> %v) { +; CHECK-LABEL: define i32 @bitcast.v2i.half( +; CHECK-SAME: <2 x half> [[V:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = bitcast <2 x half> [[V]] to i32 +; CHECK-NEXT: ret i32 [[X]] +; + %v.0 = insertelement <2 x half> %v, half 0.0, i64 1 + %x.0 = bitcast <2 x half> %v.0 to i32 + + %v.1 = insertelement <2 x half> %v, half 0.0, i64 0 + %x.1 = bitcast <2 x half> %v.1 to i32 + + %x = or i32 %x.0, %x.1 + ret i32 %x +} diff --git a/llvm/test/Transforms/InstCombine/repack-ints-thru-zext.ll b/llvm/test/Transforms/InstCombine/repack-ints-thru-zext.ll new file mode 100644 index 0000000000000..cc722187b8d61 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/repack-ints-thru-zext.ll @@ -0,0 +1,247 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=instcombine %s -S | FileCheck %s + +declare void @use.i32(i32) +declare void @use.i64(i64) + +define i64 @full_shl(i32 %x) { +; CHECK-LABEL: define i64 @full_shl( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[X_ZEXT:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[LO_SHL:%.*]] = shl nuw nsw i64 [[X_ZEXT]], 24 +; CHECK-NEXT: ret i64 [[LO_SHL]] +; + %lo = and i32 %x, u0xffff + %lo.zext = zext nneg i32 %lo to i64 + %lo.shl = shl nuw nsw i64 %lo.zext, 24 + + %hi = lshr i32 %x, 16 + %hi.zext = zext nneg i32 %hi to i64 + %hi.shl = shl nuw nsw i64 %hi.zext, 40 + + %res = or disjoint i64 %lo.shl, %hi.shl + ret i64 %res +} + +define <2 x i64> @full_shl_vec(<2 x i32> %v) { +; CHECK-LABEL: define <2 x i64> @full_shl_vec( +; CHECK-SAME: <2 x i32> [[V:%.*]]) { +; CHECK-NEXT: [[LO:%.*]] = and <2 x i32> [[V]], splat (i32 65535) +; CHECK-NEXT: [[V_ZEXT:%.*]] = zext nneg <2 x i32> [[LO]] to <2 x i64> +; CHECK-NEXT: [[LO_SHL:%.*]] = shl nuw nsw <2 x i64> [[V_ZEXT]], splat (i64 24) +; CHECK-NEXT: [[HI:%.*]] = lshr <2 x i32> [[V]], splat (i32 16) +; CHECK-NEXT: [[HI_ZEXT:%.*]] = zext nneg <2 x i32> [[HI]] to <2 x i64> +; CHECK-NEXT: [[HI_SHL:%.*]] = shl nuw nsw <2 x i64> [[HI_ZEXT]], splat (i64 40) +; CHECK-NEXT: [[RES:%.*]] = or disjoint <2 x i64> [[LO_SHL]], [[HI_SHL]] +; CHECK-NEXT: ret <2 x i64> [[RES]] +; + %lo = and <2 x i32> %v, splat(i32 u0xffff) + %lo.zext = zext nneg <2 x i32> %lo to <2 x i64> + %lo.shl = shl nuw nsw <2 x i64> %lo.zext, splat(i64 24) + + %hi = lshr <2 x i32> %v, splat(i32 16) + %hi.zext = zext nneg <2 x i32> %hi to <2 x i64> + %hi.shl = shl nuw nsw <2 x i64> %hi.zext, splat(i64 40) + + %res = or disjoint <2 x i64> %lo.shl, %hi.shl + ret <2 x i64> %res +} + +; u0xaabbccdd = -1430532899 +define i64 @partial_shl(i32 %x) { +; CHECK-LABEL: define i64 @partial_shl( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -1430532899 +; CHECK-NEXT: [[X_ZEXT:%.*]] = zext i32 [[X_MASK]] to i64 +; CHECK-NEXT: [[LO_SHL:%.*]] = shl nuw nsw i64 [[X_ZEXT]], 24 +; CHECK-NEXT: ret i64 [[LO_SHL]] +; + %lo = and i32 %x, u0xccdd + %lo.zext = zext nneg i32 %lo to i64 + %lo.shl = shl nuw nsw i64 %lo.zext, 24 + + %hi = lshr i32 %x, 16 + %hi.mask = and i32 %hi, u0xaabb + %hi.zext = zext nneg i32 %hi.mask to i64 + %hi.shl = shl nuw nsw i64 %hi.zext, 40 + + %res = or disjoint i64 %lo.shl, %hi.shl + ret i64 %res +} + +define i64 @shl_multi_use_shl(i32 %x) { +; CHECK-LABEL: define i64 @shl_multi_use_shl( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X]], 24 +; CHECK-NEXT: [[LO_SHL:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: call void 
@use.i64(i64 [[LO_SHL]]) +; CHECK-NEXT: [[HI:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[HI_ZEXT:%.*]] = zext nneg i32 [[HI]] to i64 +; CHECK-NEXT: [[HI_SHL:%.*]] = shl nuw nsw i64 [[HI_ZEXT]], 40 +; CHECK-NEXT: [[RES:%.*]] = or disjoint i64 [[HI_SHL]], [[LO_SHL]] +; CHECK-NEXT: ret i64 [[RES]] +; + %lo = and i32 %x, u0x00ff + %lo.zext = zext nneg i32 %lo to i64 + %lo.shl = shl nuw nsw i64 %lo.zext, 24 + call void @use.i64(i64 %lo.shl) + + %hi = lshr i32 %x, 16 + %hi.zext = zext nneg i32 %hi to i64 + %hi.shl = shl nuw nsw i64 %hi.zext, 40 + + %res = or disjoint i64 %lo.shl, %hi.shl + ret i64 %res +} + +define i64 @shl_multi_use_zext(i32 %x) { +; CHECK-LABEL: define i64 @shl_multi_use_zext( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[LO:%.*]] = and i32 [[X]], 255 +; CHECK-NEXT: [[LO_ZEXT:%.*]] = zext nneg i32 [[LO]] to i64 +; CHECK-NEXT: call void @use.i64(i64 [[LO_ZEXT]]) +; CHECK-NEXT: [[LO_SHL:%.*]] = shl nuw nsw i64 [[LO_ZEXT]], 24 +; CHECK-NEXT: [[HI:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[HI_ZEXT:%.*]] = zext nneg i32 [[HI]] to i64 +; CHECK-NEXT: [[HI_SHL:%.*]] = shl nuw nsw i64 [[HI_ZEXT]], 40 +; CHECK-NEXT: [[RES:%.*]] = or disjoint i64 [[LO_SHL]], [[HI_SHL]] +; CHECK-NEXT: ret i64 [[RES]] +; + %lo = and i32 %x, u0x00ff + %lo.zext = zext nneg i32 %lo to i64 + call void @use.i64(i64 %lo.zext) + %lo.shl = shl nuw nsw i64 %lo.zext, 24 + + %hi = lshr i32 %x, 16 + %hi.zext = zext nneg i32 %hi to i64 + %hi.shl = shl nuw nsw i64 %hi.zext, 40 + + %res = or disjoint i64 %lo.shl, %hi.shl + ret i64 %res +} + +define i64 @shl_multi_use_lshr(i32 %x) { +; CHECK-LABEL: define i64 @shl_multi_use_lshr( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X]], 24 +; CHECK-NEXT: [[LO_SHL:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[HI:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: call void @use.i32(i32 [[HI]]) +; CHECK-NEXT: [[HI_ZEXT:%.*]] = zext nneg i32 [[HI]] to i64 +; CHECK-NEXT: [[HI_SHL:%.*]] = shl nuw nsw i64 [[HI_ZEXT]], 40 +; CHECK-NEXT: [[RES:%.*]] = or disjoint i64 [[HI_SHL]], [[LO_SHL]] +; CHECK-NEXT: ret i64 [[RES]] +; + %lo = and i32 %x, u0x00ff + %lo.zext = zext nneg i32 %lo to i64 + %lo.shl = shl nuw nsw i64 %lo.zext, 24 + + %hi = lshr i32 %x, 16 + call void @use.i32(i32 %hi) + %hi.zext = zext nneg i32 %hi to i64 + %hi.shl = shl nuw nsw i64 %hi.zext, 40 + + %res = or disjoint i64 %lo.shl, %hi.shl + ret i64 %res +} + +define i64 @shl_non_disjoint(i32 %x) { +; CHECK-LABEL: define i64 @shl_non_disjoint( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[LO:%.*]] = and i32 [[X]], 16711680 +; CHECK-NEXT: [[LO_ZEXT:%.*]] = zext nneg i32 [[LO]] to i64 +; CHECK-NEXT: [[LO_SHL:%.*]] = shl nuw nsw i64 [[LO_ZEXT]], 24 +; CHECK-NEXT: [[HI:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: call void @use.i32(i32 [[HI]]) +; CHECK-NEXT: [[HI_ZEXT:%.*]] = zext nneg i32 [[HI]] to i64 +; CHECK-NEXT: [[HI_SHL:%.*]] = shl nuw nsw i64 [[HI_ZEXT]], 40 +; CHECK-NEXT: [[RES:%.*]] = or i64 [[LO_SHL]], [[HI_SHL]] +; CHECK-NEXT: ret i64 [[RES]] +; + %lo = and i32 %x, u0x00ff0000 + %lo.zext = zext nneg i32 %lo to i64 + %lo.shl = shl nuw nsw i64 %lo.zext, 24 + + %hi = lshr i32 %x, 16 + call void @use.i32(i32 %hi) + %hi.zext = zext nneg i32 %hi to i64 + %hi.shl = shl nuw nsw i64 %hi.zext, 40 + + %res = or i64 %lo.shl, %hi.shl + ret i64 %res +} + +define i64 @combine(i32 %lower, i32 %upper) { +; CHECK-LABEL: define i64 @combine( +; CHECK-SAME: i32 [[LOWER:%.*]], i32 [[UPPER:%.*]]) { +; CHECK-NEXT: [[BASE:%.*]] = zext i32 [[LOWER]] to i64 +; CHECK-NEXT: [[UPPER_ZEXT:%.*]] = zext i32 [[UPPER]] to 
i64 +; CHECK-NEXT: [[S_0:%.*]] = shl nuw i64 [[UPPER_ZEXT]], 32 +; CHECK-NEXT: [[O_3:%.*]] = or disjoint i64 [[S_0]], [[BASE]] +; CHECK-NEXT: ret i64 [[O_3]] +; + %base = zext i32 %lower to i64 + + %u.0 = and i32 %upper, u0xff + %z.0 = zext i32 %u.0 to i64 + %s.0 = shl i64 %z.0, 32 + %o.0 = or i64 %base, %s.0 + + %r.1 = lshr i32 %upper, 8 + %u.1 = and i32 %r.1, u0xff + %z.1 = zext i32 %u.1 to i64 + %s.1 = shl i64 %z.1, 40 + %o.1 = or i64 %o.0, %s.1 + + %r.2 = lshr i32 %upper, 16 + %u.2 = and i32 %r.2, u0xff + %z.2 = zext i32 %u.2 to i64 + %s.2 = shl i64 %z.2, 48 + %o.2 = or i64 %o.1, %s.2 + + %r.3 = lshr i32 %upper, 24 + %u.3 = and i32 %r.3, u0xff + %z.3 = zext i32 %u.3 to i64 + %s.3 = shl i64 %z.3, 56 + %o.3 = or i64 %o.2, %s.3 + + ret i64 %o.3 +} + +define i64 @combine_2(i32 %lower, i32 %upper) { +; CHECK-LABEL: define i64 @combine_2( +; CHECK-SAME: i32 [[LOWER:%.*]], i32 [[UPPER:%.*]]) { +; CHECK-NEXT: [[BASE:%.*]] = zext i32 [[LOWER]] to i64 +; CHECK-NEXT: [[S_03:%.*]] = zext i32 [[UPPER]] to i64 +; CHECK-NEXT: [[O:%.*]] = shl nuw i64 [[S_03]], 32 +; CHECK-NEXT: [[RES:%.*]] = or disjoint i64 [[O]], [[BASE]] +; CHECK-NEXT: ret i64 [[RES]] +; + %base = zext i32 %lower to i64 + + %u.0 = and i32 %upper, u0xff + %z.0 = zext i32 %u.0 to i64 + %s.0 = shl i64 %z.0, 32 + + %r.1 = lshr i32 %upper, 8 + %u.1 = and i32 %r.1, u0xff + %z.1 = zext i32 %u.1 to i64 + %s.1 = shl i64 %z.1, 40 + %o.1 = or i64 %s.0, %s.1 + + %r.2 = lshr i32 %upper, 16 + %u.2 = and i32 %r.2, u0xff + %z.2 = zext i32 %u.2 to i64 + %s.2 = shl i64 %z.2, 48 + + %r.3 = lshr i32 %upper, 24 + %u.3 = and i32 %r.3, u0xff + %z.3 = zext i32 %u.3 to i64 + %s.3 = shl i64 %z.3, 56 + %o.3 = or i64 %s.2, %s.3 + + %o = or i64 %o.1, %o.3 + %res = or i64 %o, %base + + ret i64 %res +} diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll index 798cc1543c023..7af717d3fa1f7 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-LABEL: @uadd_sat_v2i16( @@ -12,7 +12,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x 
i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -47,7 +47,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -82,7 +82,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -117,7 +117,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: ret <2 x i16> [[INS_1]] ; @@ -152,7 +152,7 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -177,7 +177,7 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -202,7 +202,7 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] 
= insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -227,7 +227,7 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 ; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) ; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) -; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0 ; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 ; GCN-NEXT: ret <2 x i32> [[INS_1]] ; @@ -255,7 +255,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX7-NEXT: ret <3 x i16> [[INS_2]] @@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> ; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; @@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX9-NEXT: bb: ; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) ; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> ; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX9-NEXT: ret <3 x i16> [[INS_2]] ; @@ -315,7 +309,7 @@ define <4 x i16> 
@uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) ; GFX7-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 ; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 @@ -323,27 +317,19 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]]) +; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) +; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) ; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]]) -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> ; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( ; GFX9-NEXT: bb: -; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]]) +; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) +; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) ; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> -; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]]) -; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> ; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> ; GFX9-NEXT: ret <4 x i16> [[INS_31]] ; @@ -382,7 +368,7 @@ define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1, ptr addrspace(1) ; GFX7-NEXT: [[ADD_1:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_1]], i8 [[ARG1_1]]) ; GFX7-NEXT: [[ADD_2:%.*]] = 
call i8 @llvm.uadd.sat.i8(i8 [[ARG0_2]], i8 [[ARG1_2]]) ; GFX7-NEXT: [[ADD_3:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[ARG0_3]], i8 [[ARG1_3]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i8> undef, i8 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i8> poison, i8 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i8> [[INS_0]], i8 [[ADD_1]], i64 1 ; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i8> [[INS_1]], i8 [[ADD_2]], i64 2 ; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i8> [[INS_2]], i8 [[ADD_3]], i64 3 @@ -432,7 +418,7 @@ define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) { ; GFX7-NEXT: [[ADD_1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_1]], i8 [[ARG1_1]]) ; GFX7-NEXT: [[ADD_2:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_2]], i8 [[ARG1_2]]) ; GFX7-NEXT: [[ADD_3:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[ARG0_3]], i8 [[ARG1_3]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i8> undef, i8 [[ADD_0]], i64 0 +; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i8> poison, i8 [[ADD_0]], i64 0 ; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i8> [[INS_0]], i8 [[ADD_1]], i64 1 ; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i8> [[INS_1]], i8 [[ADD_2]], i64 2 ; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i8> [[INS_2]], i8 [[ADD_3]], i64 3 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll index 8d518c538a2a3..bc41ff2e1ae06 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll @@ -2,6 +2,7 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX7 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=VECI8 %s define protected amdgpu_kernel void @phi(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) { ; GCN-LABEL: @vectorizePHI( @@ -94,6 +95,24 @@ define protected amdgpu_kernel void @phi(ptr addrspace(3) %inptr0, ptr addrspace ; GFX8PLUS-NEXT: store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16 ; GFX8PLUS-NEXT: ret void ; +; VECI8-LABEL: @phi( +; VECI8-NEXT: entry: +; VECI8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; VECI8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; VECI8-NEXT: br label [[DO_BODY:%.*]] +; VECI8: do.body: +; VECI8-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[DO_BODY]] ] +; VECI8-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; VECI8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: store <16 x i8> [[TMP4]], ptr addrspace(3) [[INPTR1:%.*]], align 2 +; VECI8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; VECI8-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]] +; VECI8: exit: +; VECI8-NEXT: store <16 x i8> [[TMP4]], ptr [[OUT:%.*]], align 16 +; VECI8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1:%.*]], align 16 +; VECI8-NEXT: ret void +; entry: %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0 %ele0 = load i8, ptr addrspace(3) %gep0, align 8 @@ -225,6 
+244,29 @@ define protected amdgpu_kernel void @arith_phi(ptr addrspace(3) %inptr0, ptr %ou ; GFX8PLUS-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2 ; GFX8PLUS-NEXT: ret void ; +; VECI8-LABEL: @arith_phi( +; VECI8-NEXT: entry: +; VECI8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; VECI8-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; VECI8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; VECI8-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; VECI8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; VECI8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; VECI8-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]] +; VECI8: bb.1: +; VECI8-NEXT: [[TMP1:%.*]] = add <4 x i8> [[TMP0]], splat (i8 1) +; VECI8-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> +; VECI8-NEXT: br label [[EXIT]] +; VECI8: exit: +; VECI8-NEXT: [[TMP3:%.*]] = phi <4 x i8> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB_1]] ] +; VECI8-NEXT: [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; VECI8-NEXT: [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; VECI8-NEXT: [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; VECI8-NEXT: [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; VECI8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: store <16 x i8> [[TMP4]], ptr [[OUT:%.*]], align 2 +; VECI8-NEXT: ret void +; entry: %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0 %ele0 = load i8, ptr addrspace(3) %gep0, align 8 @@ -355,6 +397,30 @@ define protected amdgpu_kernel void @arith(<16 x i8> %invec, ptr %out, i32 %flag ; GFX8PLUS-NEXT: store <16 x i8> [[VECINS15]], ptr [[OUT:%.*]], align 16 ; GFX8PLUS-NEXT: ret void ; +; VECI8-LABEL: @arith( +; VECI8-NEXT: entry: +; VECI8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <4 x i32> +; VECI8-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1) +; VECI8-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1) +; VECI8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; VECI8-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1) +; VECI8-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1) +; VECI8-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; VECI8-NEXT: [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1) +; VECI8-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1) +; VECI8-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; VECI8-NEXT: [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1) +; VECI8-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1) +; VECI8-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> +; VECI8-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> +; VECI8-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; VECI8-NEXT: [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> +; 
VECI8-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT:%.*]], align 16 +; VECI8-NEXT: ret void +; entry: %el0 = extractelement <16 x i8> %invec, i64 0 %el1 = extractelement <16 x i8> %invec, i64 1 diff --git a/llvm/test/tools/llvm-objcopy/ELF/dump-offload-bundle.test b/llvm/test/tools/llvm-objcopy/ELF/dump-offload-bundle.test new file mode 100644 index 0000000000000..518cdace8e29c --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/ELF/dump-offload-bundle.test @@ -0,0 +1,60 @@ +## Test that --dump-offload-bundle with a fatbin works correctly +# REQUIRES: target={{x86_64-.*-linux.*}} +# REQUIRES: amdgpu-registered-target + +# RUN: yaml2obj %s -o %t.elf +# RUN: llvm-objcopy --dump-offload-bundle=file://%t.elf#offset=8192\&size=4048 +# RUN: llvm-objdump -d %t.elf-offset8192-size4048.co | FileCheck %s + +# CHECK: s_load_dword s7, s[4:5], 0x24 // 000000001900: C00201C2 00000024 +# CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 // 000000001908: C00A0002 00000000 +# CHECK-NEXT: v_mov_b32_e32 v1, 0 // 000000001910: 7E020280 +# CHECK-NEXT: s_waitcnt lgkmcnt(0) // 000000001914: BF8CC07F +# CHECK-NEXT: s_and_b32 s4, s7, 0xffff // 000000001918: 8604FF07 0000FFFF +# CHECK-NEXT: s_mul_i32 s6, s6, s4 // 000000001920: 92060406 +# CHECK-NEXT: v_add_u32_e32 v0, s6, v0 // 000000001924: 68000006 +# CHECK-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] // 000000001928: D28F0000 00020082 +# CHECK-NEXT: v_mov_b32_e32 v3, s3 // 000000001930: 7E060203 +# CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 // 000000001934: 32040002 +# CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc // 000000001938: 38060303 +# CHECK-NEXT: global_load_dword v2, v[2:3], off // 00000000193C: DC508000 027F0002 +# CHECK-NEXT: v_mov_b32_e32 v3, s1 // 000000001944: 7E060201 +# CHECK-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 // 000000001948: 32000000 +# CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc // 00000000194C: 38020303 +# CHECK-NEXT: global_load_dword v3, v[0:1], off // 000000001950: DC508000 037F0000 +# CHECK-NEXT: s_waitcnt vmcnt(0) // 000000001958: BF8C0F70 +# CHECK-NEXT: v_add_u32_e32 v2, v3, v2 // 00000000195C: 68040503 +# CHECK-NEXT: global_store_dword v[0:1], v2, off // 000000001960: DC708000 007F0200 +# CHECK-NEXT: s_endpgm // 000000001968: BF810000 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x2041B0 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x200040 + Align: 0x8 + Offset: 0x40 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x0 + Offset: 0x0 +Sections: + - Name: .hip_fatbin + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x201000 + AddressAlign: 0x1000 + Content:
5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B00000000000000686F73742D7838365F36342D756E6B6E6F776E2D6C696E75782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D676678393038000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000340500000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C756583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7983A72E6
F66667365742EA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A6540AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B000000000000001800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E20323431373620663935303039613166393032313232343865313036333964653837653635636163616338643961372900000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E72656C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3973696D706C
65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E0000000200000008000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000 + - Name: .hipFatBinSegment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x202FD0 + AddressAlign: 0x8 + Content: '465049480100000000102000000000000000000000000000' +... 
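The RUN lines in the test above pin down the URI contract for the new option: --dump-offload-bundle takes file://<path>#offset=<N>&size=<M> and writes the extracted code object to <path>-offset<N>-size<M>.co next to the input. A minimal sketch of driving the same entry point directly, assuming only the object::extractOffloadBundleByURI declaration this patch adds (the input file name is a placeholder):

// Sketch under the assumptions above; not part of the patch itself.
#include "llvm/ADT/StringRef.h"
#include "llvm/Object/OffloadBundle.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // Same URI shape as the RUN line: offset and size select the bundle entry.
  StringRef URI = "file://input.elf#offset=8192&size=4048";
  if (Error Err = object::extractOffloadBundleByURI(URI)) {
    // toString() consumes the Error, so no unchecked-error assertion fires.
    errs() << "extraction failed: " << toString(std::move(Err)) << "\n";
    return 1;
  }
  // On success, input.elf-offset8192-size4048.co holds the code object,
  // mirroring the llvm-objdump step in the test.
  return 0;
}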
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test new file mode 100644 index 0000000000000..9656172f2941c --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test @@ -0,0 +1,42 @@ +## Test that --offloading with a fatbin works correctly +# REQUIRES: target={{x86_64-.*-linux.*}} +# REQUIRES: amdgpu-registered-target + +# RUN: yaml2obj %s -o %t.elf +# RUN: llvm-readobj --offloading %t.elf > %t.out +# RUN: FileCheck %s --input-file=%t.out -DFILE_NAME=%t.elf + +# CHECK: host-x86_64-unknown-linux-- file://[[FILE_NAME]]#offset=8192&size=0 +# CHECK-NEXT: hipv4-amdgcn-amd-amdhsa--gfx908 file://[[FILE_NAME]]#offset=8192&size=4048 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x2041B0 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x200040 + Align: 0x8 + Offset: 0x40 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x0 + Offset: 0x0 +Sections: + - Name: .hip_fatbin + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x201000 + AddressAlign: 0x1000 + Content: 5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B00000000000000686F73742D7838365F36342D756E6B6E6F776E2D6C696E75782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D676678393038000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000340500000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F
642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C756583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7983A72E6F66667365742EA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A6540AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B0000000000000
01800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E20323431373620663935303039613166393032313232343865313036333964653837653635636163616338643961372900000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E72656C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E0000000200000008000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000 + - Name: .hipFatBinSegment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x202FD0 + AddressAlign: 0x8 + Content: '465049480100000000102000000000000000000000000000' +... 
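The CHECK lines above fix the shape of llvm-readobj --offloading output: one line per bundle entry, a bundle id followed by a file://<path>#offset=<N>&size=<M> URI. For tooling that consumes this listing, a small stand-alone parser for the fragment syntax could look like the sketch below; parseBundleURI is a hypothetical helper written for illustration, not an API added by this patch:

// Hypothetical helper: decode file://<path>#offset=<N>&size=<M>.
#include <cstdio>
#include <iostream>
#include <optional>
#include <string>

struct BundleURI {
  std::string Path;
  unsigned long long Offset = 0;
  unsigned long long Size = 0;
};

static std::optional<BundleURI> parseBundleURI(const std::string &URI) {
  static const std::string Scheme = "file://";
  if (URI.compare(0, Scheme.size(), Scheme) != 0)
    return std::nullopt;
  std::size_t Hash = URI.find('#', Scheme.size());
  if (Hash == std::string::npos)
    return std::nullopt;
  BundleURI Out;
  Out.Path = URI.substr(Scheme.size(), Hash - Scheme.size());
  // The fragment carries exactly two decimal fields: offset and size.
  if (std::sscanf(URI.c_str() + Hash + 1, "offset=%llu&size=%llu",
                  &Out.Offset, &Out.Size) != 2)
    return std::nullopt;
  return Out;
}

int main() {
  // The second CHECK line above, with the -DFILE_NAME substitution applied.
  if (auto U = parseBundleURI("file://test.elf#offset=8192&size=4048"))
    std::cout << U->Path << ": offset " << U->Offset << ", size " << U->Size
              << "\n";
  return 0;
}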
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index 0d209590655ef..94e79ca6491ae 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -16,6 +16,8 @@ #include "llvm/ObjCopy/ConfigManager.h" #include "llvm/ObjCopy/MachO/MachOConfig.h" #include "llvm/Object/Binary.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Object/OffloadBundle.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/CRC.h" @@ -284,6 +286,11 @@ static Expected parseVisibilityType(StringRef VisType) { return type; } +static void llvm::objcopy::parseDumpOffloadBundle(StringRef URI) { + if (Error Err = object::extractOffloadBundleByURI(URI)) + errs() << "Failed to extract from URI: " << toString(std::move(Err)) << "\n"; +} + namespace { struct TargetInfo { FileFormat Format; @@ -727,6 +734,15 @@ objcopy::parseObjcopyOptions(ArrayRef ArgsArr, SmallVector Positional; + ConfigManager ConfigMgr; + CommonConfig &Config = ConfigMgr.Common; + COFFConfig &COFFConfig = ConfigMgr.COFF; + ELFConfig &ELFConfig = ConfigMgr.ELF; + MachOConfig &MachOConfig = ConfigMgr.MachO; + + if (InputArgs.hasArg(OBJCOPY_dump_offload_bundle)) + Config.NeedPositional = false; + for (auto *Arg : InputArgs.filtered(OBJCOPY_UNKNOWN)) return createStringError(errc::invalid_argument, "unknown argument '%s'", Arg->getAsString(InputArgs).c_str()); @@ -734,27 +750,29 @@ objcopy::parseObjcopyOptions(ArrayRef ArgsArr, for (auto *Arg : InputArgs.filtered(OBJCOPY_INPUT)) Positional.push_back(Arg->getValue()); - if (Positional.empty()) + if (Positional.empty() && Config.NeedPositional) return createStringError(errc::invalid_argument, "no input file specified"); - if (Positional.size() > 2) + if (Positional.size() > 2 && Config.NeedPositional) return createStringError(errc::invalid_argument, "too many positional arguments"); - ConfigManager ConfigMgr; - CommonConfig &Config = ConfigMgr.Common; - COFFConfig &COFFConfig = ConfigMgr.COFF; - ELFConfig &ELFConfig = ConfigMgr.ELF; - MachOConfig &MachOConfig = ConfigMgr.MachO; - Config.InputFilename = Positional[0]; - Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1]; - if (InputArgs.hasArg(OBJCOPY_target) && - (InputArgs.hasArg(OBJCOPY_input_target) || - InputArgs.hasArg(OBJCOPY_output_target))) - return createStringError( - errc::invalid_argument, - "--target cannot be used with --input-target or --output-target"); + if (Arg *A = InputArgs.getLastArg(OBJCOPY_dump_offload_bundle)) { + for (StringRef URIStr : llvm::split(A->getValue(), ",")) { + llvm::objcopy::parseDumpOffloadBundle(URIStr); + } + } + if (Config.NeedPositional) { + Config.InputFilename = Positional[0]; + Config.OutputFilename = Positional[Positional.size() == 1 ?
0 : 1]; + if (InputArgs.hasArg(OBJCOPY_target) && + (InputArgs.hasArg(OBJCOPY_input_target) || + InputArgs.hasArg(OBJCOPY_output_target))) + return createStringError( + errc::invalid_argument, + "--target cannot be used with --input-target or --output-target"); + } if (InputArgs.hasArg(OBJCOPY_regex) && InputArgs.hasArg(OBJCOPY_wildcard)) return createStringError(errc::invalid_argument, "--regex and --wildcard are incompatible"); @@ -1417,25 +1435,26 @@ objcopy::parseInstallNameToolOptions(ArrayRef ArgsArr) { Arg->getAsString(InputArgs).c_str()); for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_INPUT)) Positional.push_back(Arg->getValue()); - if (Positional.empty()) + if (Positional.empty() && Config.NeedPositional) return createStringError(errc::invalid_argument, "no input file specified"); - if (Positional.size() > 1) + if (Positional.size() > 1 && Config.NeedPositional) return createStringError( errc::invalid_argument, "llvm-install-name-tool expects a single input file"); - Config.InputFilename = Positional[0]; - Config.OutputFilename = Positional[0]; - - Expected> BinaryOrErr = - createBinary(Config.InputFilename); - if (!BinaryOrErr) - return createFileError(Config.InputFilename, BinaryOrErr.takeError()); - auto *Binary = (*BinaryOrErr).getBinary(); - if (!Binary->isMachO() && !Binary->isMachOUniversalBinary()) - return createStringError(errc::invalid_argument, - "input file: %s is not a Mach-O file", - Config.InputFilename.str().c_str()); - + if (Config.NeedPositional) { + Config.InputFilename = Positional[0]; + Config.OutputFilename = Positional[0]; + + Expected> BinaryOrErr = + createBinary(Config.InputFilename); + if (!BinaryOrErr) + return createFileError(Config.InputFilename, BinaryOrErr.takeError()); + auto *Binary = (*BinaryOrErr).getBinary(); + if (!Binary->isMachO() && !Binary->isMachOUniversalBinary()) + return createStringError(errc::invalid_argument, + "input file: %s is not a Mach-O file", + Config.InputFilename.str().c_str()); + } DC.CopyConfigs.push_back(std::move(ConfigMgr)); return std::move(DC); } @@ -1474,13 +1493,16 @@ objcopy::parseBitcodeStripOptions(ArrayRef ArgsArr, Arg->getAsString(InputArgs).c_str()); SmallVector Positional; - for (auto *Arg : InputArgs.filtered(BITCODE_STRIP_INPUT)) - Positional.push_back(Arg->getValue()); - if (Positional.size() > 1) - return createStringError(errc::invalid_argument, - "llvm-bitcode-strip expects a single input file"); - assert(!Positional.empty()); - Config.InputFilename = Positional[0]; + if (Config.NeedPositional) { + for (auto *Arg : InputArgs.filtered(BITCODE_STRIP_INPUT)) + Positional.push_back(Arg->getValue()); + if (Positional.size() > 1) + return createStringError( + errc::invalid_argument, + "llvm-bitcode-strip expects a single input file"); + assert(!Positional.empty()); + Config.InputFilename = Positional[0]; + } if (!InputArgs.hasArg(BITCODE_STRIP_output)) { return createStringError(errc::invalid_argument, @@ -1542,27 +1564,31 @@ objcopy::parseStripOptions(ArrayRef RawArgsArr, exit(0); } - SmallVector Positional; - for (auto *Arg : InputArgs.filtered(STRIP_UNKNOWN)) - return createStringError(errc::invalid_argument, "unknown argument '%s'", - Arg->getAsString(InputArgs).c_str()); - for (auto *Arg : InputArgs.filtered(STRIP_INPUT)) - Positional.push_back(Arg->getValue()); - std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional)); - - if (Positional.empty()) - return createStringError(errc::invalid_argument, "no input file specified"); - - if (Positional.size() > 1 && 
@@ -1542,27 +1564,31 @@ objcopy::parseStripOptions(ArrayRef<const char *> RawArgsArr,
     exit(0);
   }
 
-  SmallVector<StringRef, 2> Positional;
-  for (auto *Arg : InputArgs.filtered(STRIP_UNKNOWN))
-    return createStringError(errc::invalid_argument, "unknown argument '%s'",
-                             Arg->getAsString(InputArgs).c_str());
-  for (auto *Arg : InputArgs.filtered(STRIP_INPUT))
-    Positional.push_back(Arg->getValue());
-  std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional));
-
-  if (Positional.empty())
-    return createStringError(errc::invalid_argument, "no input file specified");
-
-  if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output))
-    return createStringError(
-        errc::invalid_argument,
-        "multiple input files cannot be used in combination with -o");
-
   ConfigManager ConfigMgr;
   CommonConfig &Config = ConfigMgr.Common;
   ELFConfig &ELFConfig = ConfigMgr.ELF;
   MachOConfig &MachOConfig = ConfigMgr.MachO;
 
+  SmallVector<StringRef, 2> Positional;
+  if (Config.NeedPositional) {
+    for (auto *Arg : InputArgs.filtered(STRIP_UNKNOWN))
+      return createStringError(errc::invalid_argument, "unknown argument '%s'",
+                               Arg->getAsString(InputArgs).c_str());
+    for (auto *Arg : InputArgs.filtered(STRIP_INPUT))
+      Positional.push_back(Arg->getValue());
+    std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional));
+
+    if (Positional.empty())
+      return createStringError(errc::invalid_argument,
+                               "no input file specified");
+
+    if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output)) {
+      return createStringError(
+          errc::invalid_argument,
+          "multiple input files cannot be used in combination with -o");
+    }
+  }
+
   if (InputArgs.hasArg(STRIP_regex) && InputArgs.hasArg(STRIP_wildcard))
     return createStringError(errc::invalid_argument,
                              "--regex and --wildcard are incompatible");
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.h b/llvm/tools/llvm-objcopy/ObjcopyOptions.h
index 3b8878981da47..72b2171e0a477 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.h
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.h
@@ -51,6 +51,11 @@ parseBitcodeStripOptions(ArrayRef<const char *> ArgsArr,
 Expected<DriverConfig>
 parseStripOptions(ArrayRef<const char *> ArgsArr,
                   llvm::function_ref<Error(Error)> ErrorCallback);
+
+// parseDumpOffloadBundle reads a URI string and extracts the raw memory it
+// references into a code object file named after the URI.
+void parseDumpOffloadBundle(StringRef URI);
+
 } // namespace objcopy
 } // namespace llvm
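A note on the URI consumed by parseDumpOffloadBundle: it is the form that llvm-readobj --offloading prints later in this patch, assumed here to look like file://<path>#offset=<n>&size=<n>. The parser below is an illustrative, self-contained sketch of that shape; ParsedBundleURI and parseBundleURI are hypothetical names, not the OffloadBundle.h API.

#include <cstdint>
#include <optional>
#include <string>

// Hypothetical stand-in for the bundle-entry URI parsing; the
// "file://<path>#offset=<n>&size=<n>" shape is an assumption based on the
// URIs printEntriesAsURI emits further down in this patch.
struct ParsedBundleURI {
  std::string FilePath;
  uint64_t Offset = 0;
  uint64_t Size = 0;
};

static std::optional<ParsedBundleURI> parseBundleURI(const std::string &URI) {
  constexpr char Scheme[] = "file://";
  constexpr size_t SchemeLen = sizeof(Scheme) - 1;
  if (URI.rfind(Scheme, 0) != 0)
    return std::nullopt;
  size_t Hash = URI.find('#', SchemeLen);
  if (Hash == std::string::npos)
    return std::nullopt;
  ParsedBundleURI P;
  P.FilePath = URI.substr(SchemeLen, Hash - SchemeLen);
  // Expect "offset=<n>&size=<n>" after the fragment marker.
  std::string Rest = URI.substr(Hash + 1);
  const std::string Keys[2] = {"offset=", "size="};
  uint64_t *Fields[2] = {&P.Offset, &P.Size};
  for (int I = 0; I < 2; ++I) {
    if (Rest.rfind(Keys[I], 0) != 0)
      return std::nullopt;
    size_t Begin = Keys[I].size();
    size_t End = Rest.find('&', Begin);
    std::string Num = Rest.substr(
        Begin, End == std::string::npos ? std::string::npos : End - Begin);
    // Digits only; std::stoull can still throw on overflow, acceptable for a
    // sketch.
    if (Num.empty() || Num.find_first_not_of("0123456789") != std::string::npos)
      return std::nullopt;
    *Fields[I] = std::stoull(Num);
    Rest = (End == std::string::npos) ? std::string() : Rest.substr(End + 1);
  }
  return P;
}

Given "file:///tmp/a.out#offset=8192&size=4096" this yields {"/tmp/a.out", 8192, 4096}; parseDumpOffloadBundle would then read that byte range out of the named file.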
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index fbc6a59d9461e..c6216a6b8a627 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -239,6 +239,9 @@ defm dump_section
     : Eq<"dump-section",
          "Dump contents of section named <section> into file <file>">,
       MetaVarName<"section=file">;
 
+defm dump_offload_bundle
+    : Eq<"dump-offload-bundle", "Dump the offload bundle contents specified by the given URI">;
+
 defm prefix_symbols
     : Eq<"prefix-symbols", "Add <prefix> to the start of every symbol name">,
       MetaVarName<"prefix">;
diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
index ad67b673b2cc7..7d6395e98c2c2 100644
--- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -124,88 +124,89 @@ static Error executeObjcopyOnRawBinary(ConfigManager &ConfigMgr,
 static Error executeObjcopy(ConfigManager &ConfigMgr) {
   CommonConfig &Config = ConfigMgr.Common;
 
-  Expected<FilePermissionsApplier> PermsApplierOrErr =
-      FilePermissionsApplier::create(Config.InputFilename);
-  if (!PermsApplierOrErr)
-    return PermsApplierOrErr.takeError();
-
-  std::function<Error(raw_ostream &)> ObjcopyFunc;
-
-  OwningBinary<Binary> BinaryHolder;
-  std::unique_ptr<MemoryBuffer> MemoryBufferHolder;
-
-  if (Config.InputFormat == FileFormat::Binary ||
-      Config.InputFormat == FileFormat::IHex) {
-    ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
-        MemoryBuffer::getFileOrSTDIN(Config.InputFilename);
-    if (!BufOrErr)
-      return createFileError(Config.InputFilename, BufOrErr.getError());
-    MemoryBufferHolder = std::move(*BufOrErr);
-
-    if (Config.InputFormat == FileFormat::Binary)
-      ObjcopyFunc = [&](raw_ostream &OutFile) -> Error {
-        // Handle FileFormat::Binary.
-        return executeObjcopyOnRawBinary(ConfigMgr, *MemoryBufferHolder,
-                                         OutFile);
-      };
-    else
-      ObjcopyFunc = [&](raw_ostream &OutFile) -> Error {
-        // Handle FileFormat::IHex.
-        return executeObjcopyOnIHex(ConfigMgr, *MemoryBufferHolder, OutFile);
-      };
-  } else {
-    Expected<OwningBinary<Binary>> BinaryOrErr =
-        createBinary(Config.InputFilename);
-    if (!BinaryOrErr)
-      return createFileError(Config.InputFilename, BinaryOrErr.takeError());
-    BinaryHolder = std::move(*BinaryOrErr);
-
-    if (Archive *Ar = dyn_cast<Archive>(BinaryHolder.getBinary())) {
-      // Handle Archive.
-      if (Error E = executeObjcopyOnArchive(ConfigMgr, *Ar))
-        return E;
-    } else {
-      // Handle llvm::object::Binary.
-      ObjcopyFunc = [&](raw_ostream &OutFile) -> Error {
-        return executeObjcopyOnBinary(ConfigMgr, *BinaryHolder.getBinary(),
-                                      OutFile);
-      };
-    }
-  }
-
-  if (ObjcopyFunc) {
-    if (Config.SplitDWO.empty()) {
-      // Apply transformations described by Config and store result into
-      // Config.OutputFilename using specified ObjcopyFunc function.
-      if (Error E = writeToOutput(Config.OutputFilename, ObjcopyFunc))
-        return E;
-    } else {
-      Config.ExtractDWO = true;
-      Config.StripDWO = false;
-      // Copy .dwo tables from the Config.InputFilename into Config.SplitDWO
-      // file using specified ObjcopyFunc function.
-      if (Error E = writeToOutput(Config.SplitDWO, ObjcopyFunc))
-        return E;
-      Config.ExtractDWO = false;
-      Config.StripDWO = true;
-      // Apply transformations described by Config, remove .dwo tables and
-      // store result into Config.OutputFilename using specified ObjcopyFunc
-      // function.
-      if (Error E = writeToOutput(Config.OutputFilename, ObjcopyFunc))
-        return E;
-    }
-  }
-
-  if (Error E =
-          PermsApplierOrErr->apply(Config.OutputFilename, Config.PreserveDates))
-    return E;
-
-  if (!Config.SplitDWO.empty())
-    if (Error E =
-            PermsApplierOrErr->apply(Config.SplitDWO, Config.PreserveDates,
-                                     static_cast<sys::fs::perms>(0666)))
-      return E;
+  if (Config.NeedPositional) {
+    Expected<FilePermissionsApplier> PermsApplierOrErr =
+        FilePermissionsApplier::create(Config.InputFilename);
+    if (!PermsApplierOrErr)
+      return PermsApplierOrErr.takeError();
+
+    std::function<Error(raw_ostream &)> ObjcopyFunc;
+
+    OwningBinary<Binary> BinaryHolder;
+    std::unique_ptr<MemoryBuffer> MemoryBufferHolder;
+
+    if (Config.InputFormat == FileFormat::Binary ||
+        Config.InputFormat == FileFormat::IHex) {
+      ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
+          MemoryBuffer::getFileOrSTDIN(Config.InputFilename);
+      if (!BufOrErr)
+        return createFileError(Config.InputFilename, BufOrErr.getError());
+      MemoryBufferHolder = std::move(*BufOrErr);
+
+      if (Config.InputFormat == FileFormat::Binary) {
+        ObjcopyFunc = [&](raw_ostream &OutFile) -> Error {
+          // Handle FileFormat::Binary.
+          return executeObjcopyOnRawBinary(ConfigMgr, *MemoryBufferHolder,
+                                           OutFile);
+        };
+      } else {
+        ObjcopyFunc = [&](raw_ostream &OutFile) -> Error {
+          // Handle FileFormat::IHex.
+          return executeObjcopyOnIHex(ConfigMgr, *MemoryBufferHolder, OutFile);
+        };
+      }
+    } else {
+      Expected<OwningBinary<Binary>> BinaryOrErr =
+          createBinary(Config.InputFilename);
+      if (!BinaryOrErr)
+        return createFileError(Config.InputFilename, BinaryOrErr.takeError());
+      BinaryHolder = std::move(*BinaryOrErr);
+
+      if (Archive *Ar = dyn_cast<Archive>(BinaryHolder.getBinary())) {
+        // Handle Archive.
+        if (Error E = executeObjcopyOnArchive(ConfigMgr, *Ar))
+          return E;
+      } else {
+        // Handle llvm::object::Binary.
+        ObjcopyFunc = [&](raw_ostream &OutFile) -> Error {
+          return executeObjcopyOnBinary(ConfigMgr, *BinaryHolder.getBinary(),
+                                        OutFile);
+        };
+      }
+    }
+
+    if (ObjcopyFunc) {
+      if (Config.SplitDWO.empty()) {
+        // Apply transformations described by Config and store result into
+        // Config.OutputFilename using specified ObjcopyFunc function.
+        if (Error E = writeToOutput(Config.OutputFilename, ObjcopyFunc))
+          return E;
+      } else {
+        Config.ExtractDWO = true;
+        Config.StripDWO = false;
+        // Copy .dwo tables from the Config.InputFilename into Config.SplitDWO
+        // file using specified ObjcopyFunc function.
+        if (Error E = writeToOutput(Config.SplitDWO, ObjcopyFunc))
+          return E;
+        Config.ExtractDWO = false;
+        Config.StripDWO = true;
+        // Apply transformations described by Config, remove .dwo tables and
+        // store result into Config.OutputFilename using specified ObjcopyFunc
+        // function.
+        if (Error E = writeToOutput(Config.OutputFilename, ObjcopyFunc))
+          return E;
+      }
+    }
+
+    if (Error E = PermsApplierOrErr->apply(Config.OutputFilename,
+                                           Config.PreserveDates))
+      return E;
+
+    if (!Config.SplitDWO.empty())
+      if (Error E =
+              PermsApplierOrErr->apply(Config.SplitDWO, Config.PreserveDates,
+                                       static_cast<sys::fs::perms>(0666)))
+        return E;
+  }
 
   return Error::success();
 }
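The NeedPositional gating repeated through the option parsers and in executeObjcopy above is one pattern: a self-contained option such as --dump-offload-bundle switches off the usual input/output positional validation and the copy pipeline. A condensed, standalone model of that control flow, with hypothetical names (ToolConfig, parseArgs):

#include <optional>
#include <string>
#include <vector>

// Hypothetical, condensed model of the CommonConfig::NeedPositional flow.
struct ToolConfig {
  bool NeedPositional = true;         // mirrors CommonConfig::NeedPositional
  std::optional<std::string> DumpURI; // set by --dump-offload-bundle=<uri>
  std::string Input, Output;
};

// Returns an error message, or an empty string on success.
static std::string parseArgs(const std::vector<std::string> &Positional,
                             ToolConfig &C) {
  if (C.DumpURI)
    C.NeedPositional = false; // standalone mode: no input file required
  if (C.NeedPositional) {
    if (Positional.empty())
      return "no input file specified";
    if (Positional.size() > 2)
      return "too many positional arguments";
    C.Input = Positional[0];
    // With one positional argument the tool edits in place.
    C.Output = Positional.size() == 1 ? Positional[0] : Positional[1];
  }
  return {};
}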
diff --git a/llvm/tools/llvm-objdump/OffloadDump.cpp b/llvm/tools/llvm-objdump/OffloadDump.cpp
index 0eb45ef5de915..834508fd5a572 100644
--- a/llvm/tools/llvm-objdump/OffloadDump.cpp
+++ b/llvm/tools/llvm-objdump/OffloadDump.cpp
@@ -88,21 +88,30 @@ void llvm::dumpOffloadBundleFatBinary(const ObjectFile &O, StringRef ArchName) {
   if (Error Err = llvm::object::extractOffloadBundleFatBinary(O, FoundBundles))
     reportError(O.getFileName(), "while extracting offload FatBin bundles: " +
                                      toString(std::move(Err)));
-
   for (const auto &[BundleNum, Bundle] : llvm::enumerate(FoundBundles)) {
     for (OffloadBundleEntry &Entry : Bundle.getEntries()) {
-      if (!ArchName.empty() && !Entry.ID.contains(ArchName))
+      if (!ArchName.empty() && (Entry.ID.find(ArchName) == std::string::npos))
         continue;
 
-      // create file name for this object file: <bundle-filename>.<num>.<id>
-      std::string str = Bundle.getFileName().str() + "." + itostr(BundleNum) +
-                        "." + Entry.ID.str();
-      if (Error Err = object::extractCodeObject(O, Entry.Offset, Entry.Size,
-                                                StringRef(str)))
-        reportError(O.getFileName(),
-                    "while extracting offload Bundle Entries: " +
-                        toString(std::move(Err)));
+      // Create a file name for this object file: <bundle-filename>.<num>.<id>
+      std::string str = Bundle.getFileName().str() + "." + itostr(BundleNum) +
+                        "." + Entry.ID.str();
+
+      if (Bundle.isDecompressed()) {
+        if (Error Err = object::extractCodeObject(
+                Bundle.DecompressedBuffer->getMemBufferRef(), Entry.Offset,
+                Entry.Size, StringRef(str)))
+          reportError(O.getFileName(),
+                      "while extracting offload Bundle Entries: " +
+                          toString(std::move(Err)));
+      } else {
+        if (Error Err = object::extractCodeObject(O, Entry.Offset, Entry.Size,
+                                                  StringRef(str)))
+          reportError(O.getFileName(),
+                      "while extracting offload Bundle Entries: " +
+                          toString(std::move(Err)));
+      }
       outs() << "Extracting offload bundle: " << str << "\n";
     }
   }
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index 1a535ede07096..5428baf070912 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -16,6 +16,8 @@
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/Decompressor.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/OffloadBinary.h"
+#include "llvm/Object/OffloadBundle.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -230,4 +232,21 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj,
   }
 }
 
+// TODO: add proper error handling.
+void ObjDumper::printOffloading(const object::ObjectFile &Obj) {
+  // An option could let the user pick which offloading section to print; for
+  // now ELF and "hip_fatbin" are hardcoded.
+  assert((Obj.isELF() || Obj.isCOFF()) && "Invalid file type");
+
+  SmallVector<llvm::object::OffloadBundleFatBin> Bundles;
+  if (Error Err = llvm::object::extractOffloadBundleFatBinary(Obj, Bundles))
+    reportWarning(createError("cannot extract fatbin binary from object: " +
+                              toString(std::move(Err))),
+                  Obj.getFileName());
+
+  // Print all the fatbin bundles contained in this buffer as URIs.
+  for (auto &Bundle : Bundles)
+    Bundle.printEntriesAsURI();
+}
+
 } // namespace llvm
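On the TODO above: a variant that propagates the extraction failure instead of warning and carrying on with an empty bundle list might look like the sketch below. This is a hypothetical signature, not part of the patch; the patch keeps the void interface and warns.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/OffloadBundle.h"
#include "llvm/Support/Error.h"
using namespace llvm;

// Hypothetical error-propagating variant of ObjDumper::printOffloading;
// callers would route the returned Error into the tool's diagnostics.
static Error printOffloadingChecked(const object::ObjectFile &Obj) {
  SmallVector<object::OffloadBundleFatBin> Bundles;
  if (Error Err = object::extractOffloadBundleFatBinary(Obj, Bundles))
    return createFileError(Obj.getFileName(), std::move(Err));
  for (object::OffloadBundleFatBin &Bundle : Bundles)
    Bundle.printEntriesAsURI();
  return Error::success();
}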
diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h
index a76afbe9c88c7..ebb1069402b35 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/llvm/tools/llvm-readobj/ObjDumper.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/OffloadBinary.h"
 #include "llvm/Support/CommandLine.h"
 
 #include <functional>
@@ -186,6 +187,7 @@ class ObjDumper {
   std::function<Error(const Twine &Msg)> WarningHandler;
   void reportUniqueWarning(Error Err) const;
   void reportUniqueWarning(const Twine &Msg) const;
+  void printOffloading(const object::ObjectFile &Obj);
 
 protected:
   ScopedPrinter &W;
diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td
index f95461aaca1a7..4708079805227 100644
--- a/llvm/tools/llvm-readobj/Opts.td
+++ b/llvm/tools/llvm-readobj/Opts.td
@@ -64,6 +64,9 @@ def notes : FF<"notes", "Display notes">, Group<grp_elf>;
 def program_headers : FF<"program-headers", "Display program headers">, Group<grp_elf>;
 def version_info : FF<"version-info", "Display version sections">, Group<grp_elf>;
 
+def offloading : FF<"offloading",
+                    "Display the content of the offloading section">;
+
 // Mach-O specific options.
 def grp_mach_o : OptionGroup<"kind">, HelpText<"OPTIONS (Mach-O specific)">;
 def macho_data_in_code : FF<"macho-data-in-code", "Display Data in Code command">, Group<grp_mach_o>;
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp
index 1231c02035d1f..73c0ff8d7650b 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -138,6 +138,7 @@ static bool Notes;
 static bool ProgramHeaders;
 static bool SectionGroups;
 static bool VersionInfo;
+static bool Offloading;
 
 // Mach-O specific options.
 static bool MachODataInCode;
@@ -288,6 +289,7 @@ static void parseOptions(const opt::InputArgList &Args) {
     }
   }
   opts::VersionInfo = Args.hasArg(OPT_version_info);
+  opts::Offloading = Args.hasArg(OPT_offloading);
 
   // Mach-O specific options.
   opts::MachODataInCode = Args.hasArg(OPT_macho_data_in_code);
@@ -455,6 +457,8 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer,
     Dumper->printGnuHashTable();
   if (opts::VersionInfo)
     Dumper->printVersionInfo();
+  if (opts::Offloading)
+    Dumper->printOffloading(Obj);
   if (opts::StringTable)
     Dumper->printStringTable();
   if (Obj.isELF()) {
@@ -699,6 +703,7 @@ int llvm_readobj_main(int argc, char **argv, const llvm::ToolContext &) {
     opts::DynamicTable = true;
     opts::Notes = true;
     opts::VersionInfo = true;
+    opts::Offloading = true;
     opts::UnwindInfo = true;
     opts::SectionGroups = true;
     opts::HashHistogram = true;
diff --git a/llvm/unittests/Object/OffloadingBundleTest.cpp b/llvm/unittests/Object/OffloadingBundleTest.cpp
index 06d39fb33644e..66e1207efa572 100644
--- a/llvm/unittests/Object/OffloadingBundleTest.cpp
+++ b/llvm/unittests/Object/OffloadingBundleTest.cpp
@@ -53,8 +53,6 @@ toBinary(SmallVectorImpl<char> &Storage, StringRef Yaml) {
 
 TEST(OffloadingBundleTest, checkExtractOffloadBundleFatBinary) {
   // create a Memory Buffer with a fatbin offloading section
-  MemoryBufferRef mbuf;
-  StringRef FileName;
   SmallVector<OffloadBundleFatBin> Bundles;
   SmallString<0> Storage;
   // Expected<std::unique_ptr<ObjectFile>> ObjOrErr = toBinary(Storage, R"(
@@ -69,7 +67,6 @@ TEST(OffloadingBundleTest, checkExtractOffloadBundleFatBinary) {
 
 TEST(OffloadingBundleTest, checkExtractCodeObject) {
   // create a Memory Buffer with a fatbin offloading section
-  MemoryBufferRef mbuf;
   SmallVector<OffloadBundleFatBin> Bundles;
   SmallString<0> Storage;
   // Expected<std::unique_ptr<ObjectFile>> ObjOrErr = toBinary(Storage, R"(
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 3901fbee4d6d6..f6b0d87ba563d 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -113,18 +113,13 @@ function(compileDeviceRTLLibrary target_name target_triple)
   set(target_bc_flags ${ARGN})
 
   if(${target_name} MATCHES "amdgpu")
-    set(amdbc_dirs
-      "${CMAKE_BINARY_DIR}/../rocm-device-libs-prefix/src/rocm-device-libs-build/lib/llvm/lib/clang/${LLVM_VERSION_MAJOR}/lib/amdgcn/bitcode"
-      "${CMAKE_INSTALL_PREFIX}/../../amdgcn/bitcode"
-      "/opt/rocm/amdgcn/bitcode")
-    foreach(amdbc_dir ${amdbc_dirs})
-      if(EXISTS "${amdbc_dir}/ockl.bc" AND NOT _ockl_bc)
-        set(_ockl_bc ${amdbc_dir}/ockl.bc)
-      endif()
-      if(EXISTS "${amdbc_dir}/ocml.bc" AND NOT _ocml_bc)
-        set(_ocml_bc ${amdbc_dir}/ocml.bc)
-      endif()
-    endforeach()
+    find_package(AMDDeviceLibs REQUIRED CONFIG
+      HINTS ${CMAKE_BINARY_DIR}/../../tools/rocm-device-libs
+            ${CMAKE_BINARY_DIR}/../rocm-device-libs-prefix/src/rocm-device-libs-build
+            ${CMAKE_INSTALL_PREFIX})
+    get_target_property(_ocml_bc ocml IMPORTED_LOCATION)
+    get_target_property(_ockl_bc ockl IMPORTED_LOCATION)
     if(NOT _ockl_bc)
       message(FATAL_ERROR "Could not find ockl.bc")
    endif()
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 20fdf3c0be753..3cc4eed3dfed8 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -298,9 +298,9 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
   ASSERT(HasThreadState == Other.HasThreadState, nullptr);
 }
 
-[[clang::loader_uninitialized]] Local<state::TeamStateTy>
+[[clang::loader_uninitialized, gnu::used, gnu::retain]] Local<state::TeamStateTy>
     ompx::state::TeamState;
-[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
+[[clang::loader_uninitialized, gnu::used, gnu::retain]] Local<state::ThreadStateTy **>
     ompx::state::ThreadStates;
 
 namespace {
diff --git a/openmp/libompd/gdb-plugin/ompdModule.c b/openmp/libompd/gdb-plugin/ompdModule.c
index 9776a3ecccd58..c3153751c85e3 100644
--- a/openmp/libompd/gdb-plugin/ompdModule.c
+++ b/openmp/libompd/gdb-plugin/ompdModule.c
@@ -941,7 +941,8 @@ static PyObject *call_ompd_get_enclosing_parallel_handle(PyObject *self,
 
   if (retVal != ompd_rc_ok) {
     _printf("An error occurred when calling "
-            "ompd_get_enclosing_parallel_handle! Error code: %d",
+            "ompd_get_enclosing_parallel_handle! "
+            "Error code: %d",
             retVal);
     return Py_BuildValue("l", retVal);
   }
@@ -965,7 +966,7 @@ static PyObject *call_ompd_get_task_parallel_handle(PyObject *self,
 
   if (retVal != ompd_rc_ok) {
     _printf("An error occurred when calling ompd_get_task_parallel_handle! "
-            "Error code: %d");
+            "Error code: %d", retVal);
     return Py_BuildValue("l", retVal);
   }
   return PyCapsule_New(taskParallelHandle, "ParallelHandle",
@@ -1178,9 +1179,10 @@ static PyObject *call_ompd_get_icv_from_scope(PyObject *self, PyObject *args) {
 
   if (retVal != ompd_rc_ok) {
     if (retVal != ompd_rc_incomplete) {
-      _printf("An error occurred when calling ompd_get_icv_from_scope(%i, %i): "
-              "Error code: %d",
-              scope, icvId, retVal);
+      _printf(
+          "An error occurred when calling ompd_get_icv_from_scope(%i, %" PRIu64
+          "): Error code: %d",
+          scope, icvId, retVal);
     }
     return Py_None;
   }
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index ed7fac1c88956..b879b0e2d8eee 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -3274,9 +3274,9 @@ static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name,
     int eff = __kmp_affinity.core_attr_gran.core_eff;
     if (ct != KMP_HW_CORE_TYPE_UNKNOWN) {
       const char *ct_name = __kmp_hw_get_core_type_keyword(ct);
-      __kmp_str_buf_print(buffer, ":%s", name, ct_name);
+      __kmp_str_buf_print(buffer, ":%s", ct_name);
     } else if (eff >= 0 && eff < KMP_HW_MAX_NUM_CORE_EFFS) {
-      __kmp_str_buf_print(buffer, ":eff%d", name, eff);
+      __kmp_str_buf_print(buffer, ":eff%d", eff);
     }
   }
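A closing note on the State.cpp change: [[gnu::used]] forces the compiler to emit the definition even when nothing in the translation unit references it, and [[gnu::retain]] additionally places it in a SHF_GNU_RETAIN section so that linker garbage collection (--gc-sections) does not discard it either; the pair presumably keeps TeamState and ThreadStates visible to the runtime. A standalone illustration of the attribute pair, not taken from the patch:

// Standalone illustration of the attribute pair added in State.cpp.
// [[gnu::used]]  : keep the definition through compiler dead-global
//                  elimination even though nothing references it here.
// [[gnu::retain]]: emit it in a SHF_GNU_RETAIN section so linking with
//                  --gc-sections keeps it too (requires a toolchain and
//                  linker that understand SHF_GNU_RETAIN).
[[gnu::used, gnu::retain]] static int RuntimeVisibleState = 0;

int main() { return 0; } // RuntimeVisibleState survives, unreferenced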