From 8734655ed3b8d7ab0990936d775e3be291933218 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Tue, 22 Apr 2025 12:13:05 +0000 Subject: [PATCH 01/33] [release/2.8] Enable wheels (cherry picked from commit e294d4d6fb7f0552e430fd38b2acf864c8e051f2 with modifications for release/2.8) Reintroduce CIRCLE_TAG to be able to set PYTORCH_BUILD_VERSION without date --- .circleci/scripts/binary_populate_env.sh | 7 ++++- .github/scripts/build_triton_wheel.py | 33 +++++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 7f89c5c2dd8e6..b2df131ec33cb 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -5,7 +5,9 @@ export TZ=UTC tagged_version() { GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*" - if [[ ! -d "${GIT_DIR}" ]]; then + if [[ -n "${CIRCLE_TAG:-}" ]]; then + echo "${CIRCLE_TAG}" + elif [[ ! -d "${GIT_DIR}" ]]; then echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -69,6 +71,8 @@ fi export PYTORCH_BUILD_NUMBER=1 +# This part is done in the builder scripts so commenting the duplicate code +: <<'BLOCK_COMMENT' # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) @@ -117,6 +121,7 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" fi fi +BLOCK_COMMENT USE_GLOO_WITH_OPENSSL="ON" if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index beec9f96aba21..1302570432046 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os +import re import shutil import sys from pathlib import Path @@ -50,6 +51,30 @@ def patch_init_py( with open(path, "w") as f: f.write(orig) +def get_rocm_version() -> str: + rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm" + rocm_version = "0.0.0" + rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h" + if not os.path.isfile(rocm_version_h): + rocm_version_h = f"{rocm_path}/include/rocm_version.h" + # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install. + if os.path.isfile(rocm_version_h): + RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)") + RE_MINOR = re.compile(r"#define\s+ROCM_VERSION_MINOR\s+(\d+)") + RE_PATCH = re.compile(r"#define\s+ROCM_VERSION_PATCH\s+(\d+)") + major, minor, patch = 0, 0, 0 + for line in open(rocm_version_h): + match = RE_MAJOR.search(line) + if match: + major = int(match.group(1)) + match = RE_MINOR.search(line) + if match: + minor = int(match.group(1)) + match = RE_PATCH.search(line) + if match: + patch = int(match.group(1)) + rocm_version = str(major)+"."+str(minor)+"."+str(patch) + return rocm_version def build_triton( *, @@ -64,7 +89,12 @@ def build_triton( if "MAX_JOBS" not in env: max_jobs = os.cpu_count() or 1 env["MAX_JOBS"] = str(max_jobs) - + if not release: + # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8 + # while release build should only include the version, i.e. 
2.1.0 + rocm_version = get_rocm_version() + version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}" + version += version_suffix with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" @@ -88,6 +118,7 @@ def build_triton( # change built wheel name and version env["TRITON_WHEEL_NAME"] = triton_pkg_name + env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix if with_clang_ldd: env["TRITON_BUILD_WITH_CLANG_LLD"] = "1" From dc95b0c29fab05322806bf322b8fd2f442af2e42 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Tue, 15 Jul 2025 23:13:35 +0000 Subject: [PATCH 02/33] Updates to build for Noble (Ubuntu 24.04) and py3.12 (Changes selected from ef226be26bcf41291ad05ff6a51688ae7cc0bb37 and https://github.com/pytorch/pytorch/commit/fadc936fad0793e931ed2eb89577e1d10d212f71) --- .ci/docker/build.sh | 2 ++ .ci/docker/common/common_utils.sh | 4 ++++ .ci/docker/common/install_base.sh | 3 +++ .ci/docker/common/install_conda.sh | 4 ++++ .ci/docker/common/install_rocm.sh | 8 +++++--- .ci/docker/requirements-ci.txt | 3 +++ 6 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 6b978b8f4b552..6624d9928cbe0 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -52,6 +52,8 @@ fi if [[ "$image" == *-jammy* ]]; then UBUNTU_VERSION=22.04 +elif [[ "$image" == *-noble* ]]; then + UBUNTU_VERSION=24.04 elif [[ "$image" == *ubuntu* ]]; then extract_version_from_image_name ubuntu UBUNTU_VERSION fi diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh index 27c1b815a0ea8..110065698b587 100644 --- a/.ci/docker/common/common_utils.sh +++ b/.ci/docker/common/common_utils.sh @@ -23,6 +23,10 @@ conda_install() { as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $* } +conda_install_through_forge() { + as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $* +} + conda_run() { as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $* } diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index 64304fec6ed9d..7d8ae247d7a0b 100755 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -15,6 +15,9 @@ install_ubuntu() { elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then cmake3="cmake=3.22*" maybe_libiomp_dev="" + elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then + cmake3="cmake=3.28*" + maybe_libiomp_dev="" else cmake3="cmake=3.5*" maybe_libiomp_dev="libiomp-dev" diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index 11c51cac0bf83..b33f7f0a1e9d3 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -87,6 +87,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' 
<<< ${CUDA_VERSION}) fi + if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then + conda_install_through_forge libstdcxx-ng=14 + fi + # Install some other packages, including those needed for Python test reporting pip_install -r /opt/conda/requirements-ci.txt diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 2a8d5b30e74e3..fe2f35838fd9c 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -8,9 +8,11 @@ ver() { install_ubuntu() { apt-get update - if [[ $UBUNTU_VERSION == 20.04 ]]; then - # gpg-agent is not available by default on 20.04 - apt-get install -y --no-install-recommends gpg-agent + # gpg-agent is not available by default + apt-get install -y --no-install-recommends gpg-agent + if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then + echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ + | sudo tee /etc/apt/preferences.d/rocm-pin-600 fi apt-get install -y kmod apt-get install -y wget diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 4ecdde62408de..9f9360fc53270 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -16,6 +16,7 @@ click #test that import: coremltools==5.0b5 ; python_version < "3.12" +coremltools==8.3 ; python_version == "3.12" #Description: Apple framework for ML integration #Pinned versions: 5.0b5 #test that import: @@ -63,6 +64,7 @@ lark==0.12.0 #test that import: librosa>=0.6.2 ; python_version < "3.11" +librosa==0.10.2 ; python_version == "3.12" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 #test that import: test_spectral_ops.py @@ -111,6 +113,7 @@ ninja==1.11.1.3 numba==0.49.0 ; python_version < "3.9" numba==0.55.2 ; python_version == "3.9" numba==0.55.2 ; python_version == "3.10" +numba==0.60.0 ; python_version == "3.12" #Description: Just-In-Time Compiler for Numerical Functions #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py From b741af3b5be771c131352cc063fe495aa654c3d4 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Tue, 20 May 2025 02:00:27 -0500 Subject: [PATCH 03/33] [release/2.8] Make triton build ROCm version agnostic Cherry-pick of https://github.com/ROCm/pytorch/pull/2130 Validation: http://rocm-ci.amd.com/job/rocm-pytorch-manylinux-wheel-builder-lw/155/ --------- Co-authored-by: Ethan Wee Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit 2c220b2ee62d4a9391ff5e441bee46477f65849e) --- .github/scripts/amd/package_triton_wheel.sh | 31 +++++---------------- .github/scripts/amd/patch_triton_wheel.sh | 16 +++++------ 2 files changed, 14 insertions(+), 33 deletions(-) diff --git a/.github/scripts/amd/package_triton_wheel.sh b/.github/scripts/amd/package_triton_wheel.sh index 6ecf8bab116b9..fe8d915422dac 100755 --- a/.github/scripts/amd/package_triton_wheel.sh +++ b/.github/scripts/amd/package_triton_wheel.sh @@ -1,3 +1,4 @@ +#!/bin/bash set -ex # Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm @@ -50,29 +51,15 @@ do cp $lib $TRITON_ROCM_DIR/lib/ done -# Required ROCm libraries -if [[ "${MAJOR_VERSION}" == "6" ]]; then - libamdhip="libamdhip64.so.6" -else - libamdhip="libamdhip64.so.5" -fi - # Required ROCm libraries - ROCm 6.0 ROCM_SO=( - "${libamdhip}" - "libhsa-runtime64.so.1" - "libdrm.so.2" - "libdrm_amdgpu.so.1" + "libamdhip64.so" + "libhsa-runtime64.so" + "libdrm.so" + "libdrm_amdgpu.so" + "libamd_comgr.so" + "librocprofiler-register.so" ) -if [[ 
$ROCM_INT -ge 60400 ]]; then - ROCM_SO+=("libamd_comgr.so.3") -else - ROCM_SO+=("libamd_comgr.so.2") -fi - -if [[ $ROCM_INT -ge 60100 ]]; then - ROCM_SO+=("librocprofiler-register.so.0") -fi for lib in "${ROCM_SO[@]}" do @@ -94,10 +81,6 @@ do fi cp $file_path $TRITON_ROCM_DIR/lib - # When running locally, and not building a wheel, we need to satisfy shared objects requests that don't look for versions - LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g') - ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME - done # Copy Include Files diff --git a/.github/scripts/amd/patch_triton_wheel.sh b/.github/scripts/amd/patch_triton_wheel.sh index 3669134631546..fb3c0f36ddb47 100755 --- a/.github/scripts/amd/patch_triton_wheel.sh +++ b/.github/scripts/amd/patch_triton_wheel.sh @@ -19,15 +19,13 @@ replace_needed_sofiles() { find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do origname=$2 patchedname=$3 - if [[ "$origname" != "$patchedname" ]]; then - set +e - origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*") - ERRCODE=$? - set -e - if [ "$ERRCODE" -eq "0" ]; then - echo "patching $sofile entry $origname to $patchedname" - $PATCHELF_BIN --replace-needed $origname $patchedname $sofile - fi + set +e + origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*") + ERRCODE=$? + set -e + if [ "$ERRCODE" -eq "0" ]; then + echo "patching $sofile entry $origname to $patchedname" + $PATCHELF_BIN --replace-needed $origname $patchedname $sofile fi done } From b4c293a73d6d7d340dbcafb9c31d60f55313e2be Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 03:33:26 +0000 Subject: [PATCH 04/33] [release/2.8] Replace upstream install_rocm_magma.sh with rocm fork version since we need to rebuild magma for all supported architectures, we cannot use upstream magma tarball anyway --- .ci/docker/common/install_rocm_magma.sh | 82 +++++++++++++++++-------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 364ee23b97e57..db826ed6e0278 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -1,32 +1,60 @@ -#!/usr/bin/env bash -# Script used only in CD pipeline +#!/bin/bash +# Script used in CI and CD pipeline -set -eou pipefail +set -ex -function do_install() { - rocm_version=$1 - rocm_version_nodot=${1//./} +ver() { + printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' '); +} - # Version 2.7.2 + ROCm related updates - MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 - magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" +# Magma build scripts need `python` +ln -sf /usr/bin/python3 /usr/bin/python - rocm_dir="/opt/rocm" - ( - set -x - tmp_dir=$(mktemp -d) - pushd ${tmp_dir} - curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} - if tar -xvf "${magma_archive}" - then - mkdir -p "${rocm_dir}/magma" - mv include "${rocm_dir}/magma/include" - mv lib "${rocm_dir}/magma/lib" - else - echo "${magma_archive} not found, skipping magma install" - fi - popd - ) -} +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +case "$ID" in + almalinux) + yum install -y gcc-gfortran + ;; + *) + echo "No preinstalls to build magma..." 
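    # Note (illustrative): the ver() helper defined above zero-pads each dotted
    # version component into a fixed-width integer, e.g. ver 6.4.1 -> "  6004001000"
    # and ver 7.0 -> "  7000000000", so the plain arithmetic comparison a few lines
    # below can select the ROCm 7.0 magma branch.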
+ ;; +esac + +MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} + +# "install" hipMAGMA into /opt/rocm/magma by copying after build +if [[ $(ver $ROCM_VERSION) -ge $(ver 7.0) ]]; then + git clone https://github.com/ROCm/utk-magma.git -b release/2.9.0_rocm70 magma + pushd magma + # version 2.9 + ROCm 7.0 related updates + git checkout 91c4f720a17e842b364e9de41edeef76995eb9ad +else + git clone https://bitbucket.org/icl/magma.git + pushd magma + # Version 2.7.2 + ROCm related updates + git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6 +fi -do_install $1 +cp make.inc-examples/make.inc.hip-gcc-mkl make.inc +echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc +if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then + echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc +fi +echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc +echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc +export PATH="${PATH}:/opt/rocm/bin" +if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then + amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'` +else + amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs` +fi +for arch in $amdgpu_targets; do + echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc +done +# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition +sed -i 's/^FOPENMP/#FOPENMP/g' make.inc +make -f make.gen.hipMAGMA -j $(nproc) +LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" +make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" +popd +mv magma /opt/rocm From 9ed3d2e978a03b2c28109d6485db39d919fc2385 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 03:43:38 +0000 Subject: [PATCH 05/33] [release/2.8] Upgrade numpy versions; Use different package versions for py3.9; upgrade tensorboard compatible with numpy 2 Co-authored-by: Ethan Wee (cherry picked from commit e867a3de4b0196621e8e53d5338a8bb8bb62e828) (cherry picked from commit c7a1e32fbcf9e0a458d959a453de65c27c51452c) (cherry picked from commit 2a215e4a2115c999e4bb058956d888aed67787d1) (cherry picked from commit 866cc1dbb9c93f807af1ef59801c645062cbb95e) --- .ci/docker/requirements-ci.txt | 21 ++++++++------------- requirements.txt | 3 ++- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 9f9360fc53270..72811c384900c 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -110,10 +110,8 @@ ninja==1.11.1.3 #Pinned versions: 1.11.1.3 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py -numba==0.49.0 ; python_version < "3.9" -numba==0.55.2 ; python_version == "3.9" -numba==0.55.2 ; python_version == "3.10" -numba==0.60.0 ; python_version == "3.12" +numba==0.60.0 ; python_version == "3.9" +numba==0.61.2 ; python_version > "3.9" #Description: Just-In-Time Compiler for Numerical Functions #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py @@ -131,12 +129,10 @@ numba==0.60.0 ; python_version == "3.12" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==1.22.4; python_version == "3.9" or python_version == "3.10" -numpy==1.26.2; 
python_version == "3.11" or python_version == "3.12" -numpy==2.1.2; python_version >= "3.13" +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" -pandas==2.0.3; python_version < "3.13" -pandas==2.2.3; python_version >= "3.13" +pandas==2.2.3 #onnxruntime #Description: scoring engine for Open Neural Network Exchange (ONNX) models @@ -247,8 +243,8 @@ scikit-image==0.22.0 ; python_version >= "3.10" #Pinned versions: 0.20.3 #test that import: -scipy==1.10.1 ; python_version <= "3.11" -scipy==1.14.1 ; python_version >= "3.12" +scipy==1.13.1 ; python_version == "3.9" +scipy==1.14.1 ; python_version > "3.9" # Pin SciPy because of failing distribution tests (see #60347) #Description: scientific python #Pinned versions: 1.10.1 @@ -312,8 +308,7 @@ z3-solver==4.12.6.0 #Pinned versions: #test that import: -tensorboard==2.13.0 ; python_version < "3.13" -tensorboard==2.18.0 ; python_version >= "3.13" +tensorboard==2.18.0 #Description: Also included in .ci/docker/requirements-docs.txt #Pinned versions: #test that import: test_tensorboard diff --git a/requirements.txt b/requirements.txt index 18f7810de9512..f65837a0097e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,8 @@ jinja2 lintrunner ; platform_machine != "s390x" networkx ninja -numpy +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" optree>=0.13.0 packaging psutil From 12508fdaa70eba2f786c2438465ea590d8e83749 Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Wed, 18 Jun 2025 22:32:20 -0700 Subject: [PATCH 06/33] [release/2.8] Removing --user flag from all pip install commands From upstream PR https://github.com/pytorch/pytorch/pull/154900 Resolves:https://ontrack-internal.amd.com/browse/SWDEV-536994 After following steps to reproduce in container **registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16231_ubuntu22.04_py3.10_pytorch_lw_release2.7_no_user_66a18277**: ``` root@ubb4-rack-22:/var/lib/jenkins/pytorch# history 1 cd /var/lib/jenkins/pytorch 2 TEST_CONFIG=default CONTINUE_THROUGH_ERROR=True .ci/pytorch/test.sh Name: gfx90a Marketing Name: AMD Instinct MI250X/MI250 + MAYBE_ROCM=rocm/ + [[ rocm == *xpu* ]] + [[ rocm != *-bazel-* ]] + pip_install ninja==1.10.2 + pip_install_pkg='python3 -m pip install --progress-bar off' + python3 -m pip install --progress-bar off ninja==1.10.2 Collecting ninja==1.10.2 Downloading ninja-1.10.2-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (5.0 kB) Downloading ninja-1.10.2-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (108 kB) Installing collected packages: ninja Attempting uninstall: ninja Found existing installation: ninja 1.11.1.4 Uninstalling ninja-1.11.1.4: Successfully uninstalled ninja-1.11.1.4 Successfully installed ninja-1.10.2 + export PATH=/root/.local/bin:/opt/venv/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + PATH=/root/.local/bin:/opt/venv/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + [[ rocm == *aarch64* ]] + [[ rocm == *asan* ]] + [[ rocm == *-debug* ]] + [[ rocm != *-bazel-* ]] + echo 'We are not in debug mode: rocm. 
Expect the assertion to pas ``` http://rocm-ci.amd.com/job/mainline-pytorch2.7-manylinux-wheels/126/ --------- Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit 0bd4030892d02407ccb9f844727326579764c6b4) --- .ci/caffe2/test.sh | 6 +++--- .ci/onnx/test.sh | 2 +- .ci/pytorch/common_utils.sh | 18 +++++++++--------- .ci/pytorch/test.sh | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.ci/caffe2/test.sh b/.ci/caffe2/test.sh index eaef1e3ebf88a..7d1ce2fb4fa10 100755 --- a/.ci/caffe2/test.sh +++ b/.ci/caffe2/test.sh @@ -5,7 +5,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" if [[ ${BUILD_ENVIRONMENT} == *onnx* ]]; then pip install click mock tabulate networkx==2.0 - pip -q install --user "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx" + pip -q install "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx" fi # Skip tests in environments where they are not built/applicable @@ -147,8 +147,8 @@ export DNNL_MAX_CPU_ISA=AVX2 if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then # TODO(sdym@meta.com) remove this when the linked issue resolved. # py is temporary until https://github.com/Teemu/pytest-sugar/issues/241 is fixed - pip install --user py==1.11.0 - pip install --user pytest-sugar + pip install py==1.11.0 + pip install pytest-sugar # NB: Warnings are disabled because they make it harder to see what # the actual erroring test is "$PYTHON" \ diff --git a/.ci/onnx/test.sh b/.ci/onnx/test.sh index a7d3b72c62a7e..d42ca2c218dec 100755 --- a/.ci/onnx/test.sh +++ b/.ci/onnx/test.sh @@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # TODO: This can be removed later once vision is also part of the Docker image - pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" + pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" # JIT C++ extensions require ninja, so put it into PATH. 
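  # (Note: with --user removed, pip places console scripts such as ninja in the
  #  active environment's bin/ rather than ~/.local/bin.)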
export PATH="/var/lib/jenkins/.local/bin:$PATH" # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 8b05766ef4002..092d88d6387fb 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -127,9 +127,9 @@ function install_torchaudio() { if [[ "$1" == "cuda" ]]; then # TODO: This is better to be passed as a parameter from _linux-test workflow # so that it can be consistent with what is set in build - TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${commit}" + TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}" else - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}" fi } @@ -139,8 +139,8 @@ function install_torchtext() { local text_commit data_commit=$(get_pinned_commit data) text_commit=$(get_pinned_commit text) - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/data.git@${data_commit}" - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/text.git@${text_commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/data.git@${data_commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/text.git@${text_commit}" } function install_torchvision() { @@ -153,7 +153,7 @@ function install_torchvision() { echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c - LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so fi - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/vision.git@${commit}" if [ -n "${LD_PRELOAD}" ]; then LD_PRELOAD=${orig_preload} fi @@ -173,7 +173,7 @@ function install_torchrec_and_fbgemm() { if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then # install torchrec first because it installs fbgemm nightly on top of rocm fbgemm - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" pip_uninstall fbgemm-gpu-nightly pip_install tabulate # needed for newer fbgemm @@ -190,8 +190,8 @@ function install_torchrec_and_fbgemm() { rm -rf fbgemm else # See https://github.com/pytorch/pytorch/issues/106971 - CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu" - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" + CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu" + pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" fi } @@ -234,7 +234,7 @@ function checkout_install_torchbench() { function install_torchao() { local commit commit=$(get_pinned_commit torchao) - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${commit}" + pip_install --no-use-pep517 "git+https://github.com/pytorch/ao.git@${commit}" } function print_sccache_stats() { diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 425cc2a80dc73..52fb572b81c46 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh 
@@ -201,7 +201,7 @@ fi if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then # JIT C++ extensions require ninja. - pip_install --user "ninja==1.10.2" + pip_install "ninja==1.10.2" # ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins # but this script should be runnable by any user, including root export PATH="$HOME/.local/bin:$PATH" @@ -502,7 +502,7 @@ DYNAMO_BENCHMARK_FLAGS=() pr_time_benchmarks() { - pip_install --user "fbscribelogger" + pip_install "fbscribelogger" TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" @@ -1469,8 +1469,8 @@ test_bazel() { test_benchmarks() { if [[ "$BUILD_ENVIRONMENT" == *cuda* && $TEST_CONFIG != *nogpu* ]]; then - pip_install --user "pytest-benchmark==3.2.3" - pip_install --user "requests" + pip_install "pytest-benchmark==3.2.3" + pip_install "requests" BENCHMARK_DATA="benchmarks/.data" mkdir -p ${BENCHMARK_DATA} pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_default.json --fuser=default --executor=default From 90d7d4b7726ccff40ffde92138e1660efe22d0aa Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 04:21:26 +0000 Subject: [PATCH 07/33] [ROCm] Remove use of warpsize on host-side compilation (pytorch#156979) (cherry picked from upstream commit 04bd7e6850e8efec77994963ffee87549555b9c3) --- aten/src/ATen/native/cuda/Embedding.cu | 2 +- .../src/ATen/native/cuda/MultinomialKernel.cu | 2 +- aten/src/ATen/native/cuda/SoftMax.cu | 19 ++++++++++--------- aten/src/ATen/native/cuda/TensorModeKernel.cu | 2 +- aten/src/ATen/native/cuda/TensorTopK.cu | 4 ++++ aten/src/ATen/native/cuda/block_reduce.cuh | 12 +++++++++++- c10/macros/Macros.h | 16 +++++++++++++++- .../c10d/symm_mem/CUDASymmetricMemory.cu | 6 +++--- .../c10d/symm_mem/CUDASymmetricMemoryOps.cu | 2 +- 9 files changed, 47 insertions(+), 18 deletions(-) diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 5d19b95b32f9b..4b1e420d5da98 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -369,7 +369,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, int warp_size = at::cuda::warp_size(); TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 && - num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads, + num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads(), "BlockReduceSum requires all warps be active"); const int64_t *num_unique_indices_ptr = num_unique_indices.const_data_ptr(); dim3 grid = unique_indices.numel(); diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index 65770e40a8b2b..8132e7df57b51 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -86,7 +86,7 @@ void renormRows(Tensor& t) { TORCH_CHECK(props != nullptr); int numSM = props->multiProcessorCount; const int64_t maxThreads = std::min( - props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads); + props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads()); int warp_size = at::cuda::warp_size(); dim3 grid(rows < numSM * 4 ? 
rows : numSM * 4); diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 5157d37f68b34..f27d76256cdb3 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -183,15 +183,16 @@ inline dim3 SoftMaxForward_getBlockSize(uint64_t dim_size) { uint64_t block_size = 1; uint64_t max_block_size = std::min(dim_size, static_cast(max_threads)); - // We need a block size that is a multiple of C10_WARP_SIZE in order + // We need a block size that is a multiple of at::cuda::warp_size() in order // to perform block size reductions using warp shuffle instructions. - // Since max_threads is also a multiple of C10_WARPS_SIZE we do not + // Since max_threads is also a multiple of at::cuda::warp_size() we do not // risk creating a block size larger than the limit. - if (max_block_size % C10_WARP_SIZE == 0) { + int warp_size = at::cuda::warp_size(); + if (max_block_size % warp_size == 0) { block_size = max_block_size; } else { - block_size = (max_block_size / C10_WARP_SIZE + 1) * C10_WARP_SIZE; + block_size = (max_block_size / warp_size + 1) * warp_size; } return dim3(block_size); @@ -1107,7 +1108,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t constexpr int ILP = sizeof(float4) / sizeof(scalar_t); if constexpr (use_fast_softmax) { dim3 block(512); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); if (dim_size % ILP == 0) { cunn_SoftMaxForwardGmem <<>>(output_ptr, input_ptr, dim_size); @@ -1117,7 +1118,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t } } else { dim3 block = SoftMaxForward_getBlockSize(dim_size); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); @@ -1198,7 +1199,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t constexpr int ILP = sizeof(float4) / sizeof(scalar_t); if constexpr (use_fast_softmax) { dim3 block(512); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); if (dim_size % ILP == 0) { cunn_SoftMaxForwardGmem <<>>(output_ptr, input_ptr, dim_size); @@ -1208,7 +1209,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t } } else { dim3 block = SoftMaxForward_getBlockSize(dim_size); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); @@ -1274,7 +1275,7 @@ void dispatch_host_softmax_backward(int64_t dim_size, dim3 grid, Tensor &grad, T constexpr int ILP = sizeof(float4) / sizeof(output_t); dim3 block = SoftMax_getBlockSize(ILP, dim_size); - size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(output_t); bool can_use_smem = static_cast(dim_size) < max_elements_per_smem; diff 
--git a/aten/src/ATen/native/cuda/TensorModeKernel.cu b/aten/src/ATen/native/cuda/TensorModeKernel.cu index 4764b078c050b..0c97ab742103f 100644 --- a/aten/src/ATen/native/cuda/TensorModeKernel.cu +++ b/aten/src/ATen/native/cuda/TensorModeKernel.cu @@ -207,7 +207,7 @@ void handle_fused_mode( constexpr int num_threads = size / 2; int warp_size = at::cuda::warp_size(); TORCH_INTERNAL_ASSERT(num_threads % warp_size == 0 && - num_threads <= cuda_utils::kCUDABlockReduceMaxThreads, ""); + num_threads <= cuda_utils::kCUDABlockReduceMaxThreads(), ""); const auto memsize = (sizeof(scalar_t) * size) + (2 * size * sizeof(unsigned int)); compute_mode diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index 103b360bcb868..49086c42cd4a2 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -439,8 +439,12 @@ __global__ void computeBlockwiseWithinKCounts( warp_counts[warp] = count; } __syncthreads(); +#ifdef USE_ROCM + CUDA_KERNEL_ASSERT(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE); +#else static_assert(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE, "Assuming only 1 warp is needed for final reduction"); +#endif if (warp != 0) { return; } diff --git a/aten/src/ATen/native/cuda/block_reduce.cuh b/aten/src/ATen/native/cuda/block_reduce.cuh index 2a272d22c0c60..1818987c6a588 100644 --- a/aten/src/ATen/native/cuda/block_reduce.cuh +++ b/aten/src/ATen/native/cuda/block_reduce.cuh @@ -12,7 +12,17 @@ constexpr int kCUDABlockReduceNumThreads = 512; // of which reduces C10_WARP_SIZE elements. So, at most // C10_WARP_SIZE**2 elements can be reduced at a time. // NOTE: This is >= the max block size on current hardware anyway (1024). -constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE; +// ROCm NOTE: C10_WARP_SIZE should only be used inside device functions, +// and kCUDABlockReduceMaxThreads is a host-side variable. +#ifdef USE_ROCM +static int kCUDABlockReduceMaxThreads() { + return at::cuda::warp_size() * at::cuda::warp_size(); +} +#else +constexpr int kCUDABlockReduceMaxThreads() { + return C10_WARP_SIZE * C10_WARP_SIZE; +} +#endif // Sums `val` across all threads in a warp. // diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 7d8238f910464..6b51a39f2a943 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -312,7 +312,21 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #endif #if defined(USE_ROCM) -#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h) +// C10_WARP_SIZE is only allowed for device code. +// Host code _must_ use at::cuda::warp_size() +// HIP header used to define warpSize as a constexpr that was either 32 or 64 +// depending on the target device, and then always set it to 64 for host code. +// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we +// set it to something unreasonable to trigger obvious host code errors. 
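// Illustrative split under this scheme (hypothetical function names):
//   __global__ void reduce_kernel() { __shared__ float buf[C10_WARP_SIZE]; }  // device: constexpr 64 or 32
//   void launch_reduce()            { int ws = at::cuda::warp_size(); }       // host: runtime query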
+#if defined(__HIP_DEVICE_COMPILE__) +#if defined(__GFX9__) +static constexpr int C10_WARP_SIZE = 64; +#else // __GFX9__ +static constexpr int C10_WARP_SIZE = 32; +#endif // __GFX9__ +#else +static constexpr int C10_WARP_SIZE = 1; +#endif // __HIP_DEVICE_COMPILE__ #else #define C10_WARP_SIZE 32 #endif diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu index 4cc29b0c347fc..20ccf0d74b601 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu @@ -255,7 +255,7 @@ static __global__ void barrier_kernel( void CUDASymmetricMemory::barrier(int channel, size_t timeout_ms) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - barrier_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + barrier_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), channel, rank_, @@ -293,7 +293,7 @@ void CUDASymmetricMemory::put_signal( size_t timeout_ms) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - put_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + put_signal_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), dst_rank, channel, @@ -337,7 +337,7 @@ void CUDASymmetricMemory::wait_signal( size_t timeout_ms) { check_channel(channel, world_size_); c10::cuda::CUDAGuard guard(local_device_idx_); - wait_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>( + wait_signal_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>( reinterpret_cast(signal_pads_dev_), src_rank, channel, diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu index d7652b77ebab4..a2d5f8f9f67ba 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu @@ -114,7 +114,7 @@ void init_elementwise_launch_config( num_blocks = 1; num_threads = at::round_up( at::ceil_div(numel_per_split, numel_per_thread), - static_cast(C10_WARP_SIZE)); + static_cast(at::cuda::warp_size())); } else { num_blocks = std::min( at::ceil_div(numel_per_split, max_num_threads * numel_per_thread), From 186180d9aa1dd92919be12b7a1a8b4d9db5a8479 Mon Sep 17 00:00:00 2001 From: Xinya Zhang Date: Tue, 15 Jul 2025 23:12:21 -0500 Subject: [PATCH 08/33] [release/2.8] Improve C10_WARP_SIZE compatibility Cherry-pick of https://github.com/ROCm/pytorch/pull/2328 Co-authored-by: Xinya Zhang Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit fe3d37a958142978d027dc921d66b46ecc6ffdad) --- .../src/ATen/native/cuda/layer_norm_kernel.cu | 5 ++++ .../sparse/cuda/SparseCUDAApplyUtils.cuh | 4 +++ c10/macros/Macros.h | 30 ++++++++++++++----- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index bdb169e26b142..0709164e60555 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -33,7 +33,12 @@ namespace at::native { namespace { constexpr int kCUDANumThreads = 256; +#ifdef USE_ROCM +// C10_WARP_SIZE is not constexpr for host code. 
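// (On ROCm, host-side C10_WARP_SIZE now expands to a runtime at::cuda::warp_size()
//  call, so a #define is used here to defer evaluation to each use site.)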
+#define kWarpSize C10_WARP_SIZE +#else constexpr unsigned int kWarpSize = C10_WARP_SIZE; +#endif constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types // aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh) diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh index c9412d74e9cda..693ca536a3198 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -242,7 +242,11 @@ __global__ void coalesceValuesKernel( // `if constexpr` when CUDA codes will be compiled under C++-17, see // gh-56055 for blockers. template +#ifdef USE_ROCM +C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE_STATIC*4) +#else C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4) +#endif __global__ void coalesceValuesKernel( int64_t *segment_offsets, int64_t *value_indices, bool *values, bool *newValues, diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 6b51a39f2a943..77ca999090d93 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -318,16 +318,32 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; // depending on the target device, and then always set it to 64 for host code. // Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we // set it to something unreasonable to trigger obvious host code errors. -#if defined(__HIP_DEVICE_COMPILE__) + +namespace at::cuda { +TORCH_CUDA_CPP_API int warp_size(); +} +#ifdef __HIPCC__ +static inline int __host__ C10_WARP_SIZE_INTERNAL() { + return at::cuda::warp_size(); +} + +static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() { #if defined(__GFX9__) -static constexpr int C10_WARP_SIZE = 64; + return 64; #else // __GFX9__ -static constexpr int C10_WARP_SIZE = 32; + return 32; #endif // __GFX9__ -#else -static constexpr int C10_WARP_SIZE = 1; -#endif // __HIP_DEVICE_COMPILE__ -#else +} +#else // __HIPCC__ +inline int C10_WARP_SIZE_INTERNAL() { + return at::cuda::warp_size(); +} +#endif // __HIPCC__ + +#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL()) +#define C10_WARP_SIZE_STATIC 64 + +#else // defined(USE_ROCM) #define C10_WARP_SIZE 32 #endif From 8e7b99f5b048edae5df3f2557dc8d0e9085be867 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 06:18:32 +0000 Subject: [PATCH 09/33] Fix sha256 for aotriton ROCm7.0 tarball --- cmake/External/aotriton.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake index 8004b0f400a8d..8b380d24f6c8c 100644 --- a/cmake/External/aotriton.cmake +++ b/cmake/External/aotriton.cmake @@ -24,7 +24,7 @@ if(NOT __AOTRITON_INCLUDED) set(__AOTRITON_SHA256_LIST "861cd9f7479eec943933c27cb86920247e5b5dd139bc7c1376c81808abb7d7fe" # rocm6.3 "acea7d811a2d3bbe718b6e07fc2a9f739e49eecd60b4b6a36fcb3fe8edf85d78" # rocm6.4 - "7e29c325d5bd33ba896ddb106f5d4fc7d715274dca7fe937f724fffa82017838" # rocm7.0 + "1e9b3dddf0c7fc07131c6f0f5266129e83ce2331f459fa2be8c63f4ae91b0f5b" # rocm7.0 ) set(__AOTRITON_Z "gz") From d7c64fc4be65e8b567b94a0649fbcc21fa9809d2 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 06:26:24 +0000 Subject: [PATCH 10/33] Update third_party/composable_kernel submodule commit as per https://github.com/ROCm/pytorch/commit/80cca7006d94df97ee932fd5903ed20c08c2eb34 to enable PyTorch build on ROCm7.0 --- third_party/composable_kernel | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/third_party/composable_kernel b/third_party/composable_kernel index 8086bbe3a78d9..df6023e305f38 160000 --- a/third_party/composable_kernel +++ b/third_party/composable_kernel @@ -1 +1 @@ -Subproject commit 8086bbe3a78d931eb96fe12fdc014082e18d18d3 +Subproject commit df6023e305f389bbf7249b0c4414e649f3ad6598 From b81d4d1ff7ce03630110354ddfc34af708cb0d7c Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 16 Jul 2025 20:08:23 +0000 Subject: [PATCH 11/33] Use ROCm/triton and update triton.txt --- .ci/docker/ci_commit_pins/triton.txt | 2 +- .ci/docker/common/install_triton.sh | 2 +- .github/scripts/build_triton_wheel.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 568756a804f07..cf43cba72a42b 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -ae848267bebc65c6181e8cc5e64a6357d2679260 +5e5685356b9fc7b5ad9cdf4e510a1994a5b8601a diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index f5e39fbaf9ccb..f5b552e079710 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then TRITON_REPO="https://github.com/triton-lang/triton-cpu" TRITON_TEXT_FILE="triton-cpu" else - TRITON_REPO="https://github.com/triton-lang/triton" + TRITON_REPO="https://github.com/ROCm/triton" TRITON_TEXT_FILE="triton" fi diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 1302570432046..695b4a9c865a6 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -102,6 +102,7 @@ def build_triton( triton_repo = "https://github.com/openai/triton" if device == "rocm": triton_pkg_name = "pytorch-triton-rocm" + triton_repo = "https://github.com/ROCm/triton" elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" From 98e953717022b6980037b0f88ac05f5cf7a8a562 Mon Sep 17 00:00:00 2001 From: Prachi Gupta Date: Tue, 22 Jul 2025 14:31:27 -0400 Subject: [PATCH 12/33] Add related_commits file (#2396) --- related_commits | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 related_commits diff --git a/related_commits b/related_commits new file mode 100644 index 0000000000000..fd2787398fc13 --- /dev/null +++ b/related_commits @@ -0,0 +1,10 @@ +ubuntu|pytorch|apex|release/1.8.0|eab2474650906473d7d2d7053b870fe512438f90|https://github.com/ROCm/apex +centos|pytorch|apex|release/1.8.0|eab2474650906473d7d2d7053b870fe512438f90|https://github.com/ROCm/apex +ubuntu|pytorch|torchvision|release/0.23|824e8c8726b65fd9d5abdc9702f81c2b0c4c0dc8|https://github.com/pytorch/vision +centos|pytorch|torchvision|release/0.23|824e8c8726b65fd9d5abdc9702f81c2b0c4c0dc8|https://github.com/pytorch/vision +ubuntu|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data +centos|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data +ubuntu|pytorch|torchaudio|release/2.8|6e1c7fe9ff6d82b8665d0a46d859d3357d2ebaaa|https://github.com/pytorch/audio +centos|pytorch|torchaudio|release/2.8|6e1c7fe9ff6d82b8665d0a46d859d3357d2ebaaa|https://github.com/pytorch/audio +ubuntu|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao 
+centos|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao From 12a145a630f9659ceb708899af6c23c0226ba6d3 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 19 Feb 2025 18:11:37 +0000 Subject: [PATCH 13/33] Add QA automation scripts for running PyTorch unit tests (cherry picked from commit ba1ba2680558fee0c93ea030ab1a375c8c1a83fc) (cherry picked from commit 4e3462e257fd8245424bb0df67191659d8fadfda) --- .automation_scripts/parse_xml_results.py | 178 ++++++ .automation_scripts/run_pytorch_unit_tests.py | 518 ++++++++++++++++++ 2 files changed, 696 insertions(+) create mode 100644 .automation_scripts/parse_xml_results.py create mode 100644 .automation_scripts/run_pytorch_unit_tests.py diff --git a/.automation_scripts/parse_xml_results.py b/.automation_scripts/parse_xml_results.py new file mode 100644 index 0000000000000..7db2e1ce9233c --- /dev/null +++ b/.automation_scripts/parse_xml_results.py @@ -0,0 +1,178 @@ +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" + +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, Dict, Tuple + +# Backends list +BACKENDS_LIST = [ + "dist-gloo", + "dist-nccl" +] + +TARGET_WORKFLOW = "--rerun-disabled-tests" + +def get_job_id(report: Path) -> int: + # [Job id in artifacts] + # Retrieve the job id from the report path. In our GHA workflows, we append + # the job id to the end of the report name, so `report` looks like: + # unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml + # and we want to get `5596745227` out of it. 
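    # For example, report.parts[0] is "unzipped-test-reports-foo_5596745227",
    # and rpartition("_")[2] yields the trailing "5596745227", returned as an int.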
+ try: + return int(report.parts[0].rpartition("_")[2]) + except ValueError: + return -1 + +def is_rerun_disabled_tests(root: ET.ElementTree) -> bool: + """ + Check if the test report is coming from rerun_disabled_tests workflow + """ + skipped = root.find(".//*skipped") + # Need to check against None here, if not skipped doesn't work as expected + if skipped is None: + return False + + message = skipped.attrib.get("message", "") + return TARGET_WORKFLOW in message or "num_red" in message + +def parse_xml_report( + tag: str, + report: Path, + workflow_id: int, + workflow_run_attempt: int, + work_flow_name: str +) -> Dict[Tuple[str], Dict[str, Any]]: + """Convert a test report xml file into a JSON-serializable list of test cases.""" + print(f"Parsing {tag}s for test report: {report}") + + job_id = get_job_id(report) + print(f"Found job id: {job_id}") + + test_cases: Dict[Tuple[str], Dict[str, Any]] = {} + + root = ET.parse(report) + # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops + # includes skipped messages multiple times (50 times by default). This slows down + # this script too much (O(n)) because it tries to gather all the stats. This should + # be fixed later in the way we use pytest-flakefinder. A zipped test report from rerun + # disabled test is only few MB, but will balloon up to a much bigger XML file after + # extracting from a dozen to few hundred MB + if is_rerun_disabled_tests(root): + return test_cases + + for test_case in root.iter(tag): + case = process_xml_element(test_case) + if tag == 'testcase': + case["workflow_id"] = workflow_id + case["workflow_run_attempt"] = workflow_run_attempt + case["job_id"] = job_id + case["work_flow_name"] = work_flow_name + + # [invoking file] + # The name of the file that the test is located in is not necessarily + # the same as the name of the file that invoked the test. + # For example, `test_jit.py` calls into multiple other test files (e.g. + # jit/test_dce.py). For sharding/test selection purposes, we want to + # record the file that invoked the test. + # + # To do this, we leverage an implementation detail of how we write out + # tests (https://bit.ly/3ajEV1M), which is that reports are created + # under a folder with the same name as the invoking file. + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + test_cases[ ( case["invoking_file"], case["classname"], case["name"], case["work_flow_name"] ) ] = case + elif tag == 'testsuite': + case["work_flow_name"] = work_flow_name + case["invoking_xml"] = report.name + case["running_time_xml"] = case["time"] + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + + test_cases[ ( case["invoking_file"], case["invoking_xml"], case["work_flow_name"] ) ] = case + + return test_cases + +def process_xml_element(element: ET.Element) -> Dict[str, Any]: + """Convert a test suite element into a JSON-serializable dict.""" + ret: Dict[str, Any] = {} + + # Convert attributes directly into dict elements. + # e.g. + # + # becomes: + # {"name": "test_foo", "classname": "test_bar"} + ret.update(element.attrib) + + # The XML format encodes all values as strings. Convert to ints/floats if + # possible to make aggregation possible in Rockset. 
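    # e.g. attributes parsed as {"name": "test_foo", "time": "0.02"} become
    # {"name": "test_foo", "time": 0.02} after the loop below.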
+ for k, v in ret.items(): + try: + ret[k] = int(v) + except ValueError: + pass + try: + ret[k] = float(v) + except ValueError: + pass + + # Convert inner and outer text into special dict elements. + # e.g. + # my_inner_text my_tail + # becomes: + # {"text": "my_inner_text", "tail": " my_tail"} + if element.text and element.text.strip(): + ret["text"] = element.text + if element.tail and element.tail.strip(): + ret["tail"] = element.tail + + # Convert child elements recursively, placing them at a key: + # e.g. + # + # hello + # world + # another + # + # becomes + # { + # "foo": [{"text": "hello"}, {"text": "world"}], + # "bar": {"text": "another"} + # } + for child in element: + if child.tag not in ret: + ret[child.tag] = process_xml_element(child) + else: + # If there are multiple tags with the same name, they should be + # coalesced into a list. + if not isinstance(ret[child.tag], list): + ret[child.tag] = [ret[child.tag]] + ret[child.tag].append(process_xml_element(child)) + return ret \ No newline at end of file diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py new file mode 100644 index 0000000000000..514afd19624c3 --- /dev/null +++ b/.automation_scripts/run_pytorch_unit_tests.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 + +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+""" + +import argparse +import os +import shutil +import subprocess +from subprocess import STDOUT, CalledProcessError + +from collections import namedtuple +from datetime import datetime +from pathlib import Path +from parse_xml_results import ( + parse_xml_report +) +from pprint import pprint +from typing import Any, Dict, List + +# unit test status list +UT_STATUS_LIST = [ + "PASSED", + "MISSED", + "SKIPPED", + "FAILED", + "XFAILED", + "ERROR" +] + +DEFAULT_CORE_TESTS = [ + "test_nn", + "test_torch", + "test_cuda", + "test_ops", + "test_unary_ufuncs", + "test_autograd", + "inductor/test_torchinductor" +] + +DISTRIBUTED_CORE_TESTS = [ + "distributed/test_c10d_common", + "distributed/test_c10d_nccl", + "distributed/test_distributed_spawn" +] + +CONSOLIDATED_LOG_FILE_NAME="pytorch_unit_tests.log" + +def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, workflow_name, path="."): + test_cases = {} + items_list = os.listdir(path) + for dir in items_list: + new_dir = path + '/' + dir + '/' + if os.path.isdir(new_dir): + for xml_report in Path(new_dir).glob("**/*.xml"): + test_cases.update( + parse_xml_report( + tag, + xml_report, + workflow_run_id, + workflow_run_attempt, + workflow_name + ) + ) + return test_cases + +def get_test_status(test_case): + # In order of priority: S=skipped, F=failure, E=error, P=pass + if "skipped" in test_case and test_case["skipped"]: + type_message = test_case["skipped"] + if type_message.__contains__('type') and type_message['type'] == "pytest.xfail": + return "XFAILED" + else: + return "SKIPPED" + elif "failure" in test_case and test_case["failure"]: + return "FAILED" + elif "error" in test_case and test_case["error"]: + return "ERROR" + else: + return "PASSED" + +def get_test_message(test_case, status=None): + if status == "SKIPPED": + return test_case["skipped"] if "skipped" in test_case else "" + elif status == "FAILED": + return test_case["failure"] if "failure" in test_case else "" + elif status == "ERROR": + return test_case["error"] if "error" in test_case else "" + else: + if "skipped" in test_case: + return test_case["skipped"] + elif "failure" in test_case: + return test_case["failure"] + elif "error" in test_case: + return test_case["error"] + else: + return "" + +def get_test_file_running_time(test_suite): + if test_suite.__contains__('time'): + return test_suite["time"] + return 0 + +def get_test_running_time(test_case): + if test_case.__contains__('time'): + return test_case["time"] + return "" + +def summarize_xml_files(path, workflow_name): + # statistics + TOTAL_TEST_NUM = 0 + TOTAL_PASSED_NUM = 0 + TOTAL_SKIPPED_NUM = 0 + TOTAL_XFAIL_NUM = 0 + TOTAL_FAILED_NUM = 0 + TOTAL_ERROR_NUM = 0 + TOTAL_EXECUTION_TIME = 0 + + #parse the xml files + test_cases = parse_xml_reports_as_dict(-1, -1, 'testcase', workflow_name, path) + test_suites = parse_xml_reports_as_dict(-1, -1, 'testsuite', workflow_name, path) + test_file_and_status = namedtuple("test_file_and_status", ["file_name", "status"]) + # results dict + res = {} + res_item_list = [ "PASSED", "SKIPPED", "XFAILED", "FAILED", "ERROR" ] + test_file_items = set() + for (k,v) in list(test_suites.items()): + file_name = k[0] + if not file_name in test_file_items: + test_file_items.add(file_name) + # initialization + for item in res_item_list: + temp_item = test_file_and_status(file_name, item) + res[temp_item] = {} + temp_item_statistics = test_file_and_status(file_name, "STATISTICS") + res[temp_item_statistics] = {'TOTAL': 0, 'PASSED': 0, 'SKIPPED': 0, 'XFAILED': 0, 'FAILED': 0, 
'ERROR': 0, 'EXECUTION_TIME': 0} + test_running_time = get_test_file_running_time(v) + res[temp_item_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + else: + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + test_running_time = get_test_file_running_time(v) + res[test_tuple_key_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + + for (k,v) in list(test_cases.items()): + file_name = k[0] + class_name = k[1] + test_name = k[2] + combined_name = file_name + "::" + class_name + "::" + test_name + test_status = get_test_status(v) + test_running_time = get_test_running_time(v) + test_message = get_test_message(v, test_status) + test_info_value = "" + test_tuple_key_status = test_file_and_status(file_name, test_status) + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + TOTAL_TEST_NUM += 1 + res[test_tuple_key_statistics]["TOTAL"] += 1 + if test_status == "PASSED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["PASSED"] += 1 + TOTAL_PASSED_NUM += 1 + elif test_status == "SKIPPED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["SKIPPED"] += 1 + TOTAL_SKIPPED_NUM += 1 + elif test_status == "XFAILED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["XFAILED"] += 1 + TOTAL_XFAIL_NUM += 1 + elif test_status == "FAILED": + test_info_value = test_message + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["FAILED"] += 1 + TOTAL_FAILED_NUM += 1 + elif test_status == "ERROR": + test_info_value = test_message + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["ERROR"] += 1 + TOTAL_ERROR_NUM += 1 + + # generate statistics_dict + statistics_dict = {} + statistics_dict["TOTAL"] = TOTAL_TEST_NUM + statistics_dict["PASSED"] = TOTAL_PASSED_NUM + statistics_dict["SKIPPED"] = TOTAL_SKIPPED_NUM + statistics_dict["XFAILED"] = TOTAL_XFAIL_NUM + statistics_dict["FAILED"] = TOTAL_FAILED_NUM + statistics_dict["ERROR"] = TOTAL_ERROR_NUM + statistics_dict["EXECUTION_TIME"] = TOTAL_EXECUTION_TIME + aggregate_item = workflow_name + "_aggregate" + total_item = test_file_and_status(aggregate_item, "STATISTICS") + res[total_item] = statistics_dict + + return res + +def run_command_and_capture_output(cmd): + try: + print(f"Running command '{cmd}'") + with open(CONSOLIDATED_LOG_FILE_PATH, "a+") as output_file: + print(f"========================================", file=output_file, flush=True) + print(f"[RUN_PYTORCH_UNIT_TESTS] Running command '{cmd}'", file=output_file, flush=True) # send to consolidated file as well + print(f"========================================", file=output_file, flush=True) + p = subprocess.run(cmd, shell=True, stdout=output_file, stderr=STDOUT, text=True) + except CalledProcessError as e: + print(f"ERROR: Cmd {cmd} failed with return code: {e.returncode}!") + +def run_entire_tests(workflow_name, test_shell_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + copied_logs_path = 
overall_logs_path_current_run + "default_xml_results_entire_tests/" + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_entire_tests/" + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_entire_tests/" + # use test.sh for tests execution + run_command_and_capture_output(test_shell_path) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + entire_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + return entire_results_dict + +def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_priority_tests/" + # use run_test.py for tests execution + default_priority_test_suites = " ".join(DEFAULT_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_priority_tests/" + # use run_test.py for tests execution + distributed_priority_test_suites = " ".join(DISTRIBUTED_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + distributed_priority_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + priority_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return priority_results_dict + +def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src, selected_list): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_selected_tests/" + # use run_test.py for tests execution + default_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_selected_tests/" + # use run_test.py for tests execution + distributed_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del 
os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_selected_tests/" + inductor_selected_test_suites = "" + non_inductor_selected_test_suites = "" + for item in selected_list: + if "inductor/" in item: + inductor_selected_test_suites += item + inductor_selected_test_suites += " " + else: + non_inductor_selected_test_suites += item + non_inductor_selected_test_suites += " " + if inductor_selected_test_suites != "": + inductor_selected_test_suites = inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + if non_inductor_selected_test_suites != "": + non_inductor_selected_test_suites = non_inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + selected_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return selected_results_dict + +def run_test_and_summarize_results( + pytorch_root_dir: str, + priority_tests: bool, + test_config: List[str], + default_list: List[str], + distributed_list: List[str], + inductor_list: List[str], + skip_rerun: bool) -> Dict[str, Any]: + + # copy current environment variables + _environ = dict(os.environ) + + # modify path + test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh" + test_run_test_path = pytorch_root_dir + "/test/run_test.py" + repo_test_log_folder_path = pytorch_root_dir + "/.automation_logs/" + test_reports_src = pytorch_root_dir + "/test/test-reports/" + run_test_python_file = pytorch_root_dir + "/test/run_test.py" + + # change directory to pytorch root + os.chdir(pytorch_root_dir) + + # all test results dict + res_all_tests_dict = {} + + # patterns + search_text = "--reruns=2" + replace_text = "--reruns=0" + + # create logs folder + if not os.path.exists(repo_test_log_folder_path): + os.mkdir(repo_test_log_folder_path) + + # Set common environment variables for all scenarios + os.environ['CI'] = '1' + os.environ['PYTORCH_TEST_WITH_ROCM'] = '1' + os.environ['HSA_FORCE_FINE_GRAIN_PCIE'] = '1' + os.environ['PYTORCH_TESTING_DEVICE_ONLY_FOR'] = 'cuda' + os.environ['CONTINUE_THROUGH_ERROR'] = 'True' + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(search_text, replace_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + # Time stamp + current_datetime = datetime.now().strftime("%Y%m%d_%H-%M-%S") + print("Current date & time : ", current_datetime) + # performed as Job ID + str_current_datetime = str(current_datetime) + overall_logs_path_current_run = repo_test_log_folder_path + str_current_datetime + "/" + os.mkdir(overall_logs_path_current_run) + + global CONSOLIDATED_LOG_FILE_PATH + CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME + + # Check multi gpu availability if distributed tests are enabled + if ("distributed" in test_config) or len(distributed_list) != 0: + check_num_gpus_for_distributed() + + # Install test requirements + command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt" + run_command_and_capture_output(command) + + # Run 
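When `--skip_rerun` is set, the script rewrites `run_test.py` in place (swapping `--reruns=2` for `--reruns=0`) and writes the original text back at the end of the run. For illustration only, and not what the patch itself does, the same toggle can be expressed as a context manager so the restore happens even if the run raises:

```python
# Illustrative alternative to the in-place patching used by this script;
# assumes the same "--reruns=2" literal is present in run_test.py.
from contextlib import contextmanager

@contextmanager
def reruns_disabled(run_test_path):
    with open(run_test_path) as f:
        original = f.read()
    try:
        with open(run_test_path, "w") as f:
            f.write(original.replace("--reruns=2", "--reruns=0"))
        yield
    finally:
        with open(run_test_path, "w") as f:
            f.write(original)
```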
entire tests for each workflow + if not priority_tests and not default_list and not distributed_list and not inductor_list: + # run entire tests for default, distributed and inductor workflows → use test.sh + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + # distributed test process + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + # inductor test process + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + if "distributed" in workflow_list: + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + if "inductor" in workflow_list: + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + # Run priority test for each workflow + elif priority_tests and not default_list and not distributed_list and not inductor_list: + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_priority + # distributed test process + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + # will not run inductor priority tests + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_priority + if "distributed" in workflow_list: + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + if "inductor" in workflow_list: + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + # Run specified tests for each workflow + elif (default_list or distributed_list or inductor_list) and not test_config and not priority_tests: + if default_list: + default_workflow_list = [] + for item in default_list: + default_workflow_list.append(item) + res_default_selected = run_selected_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src, default_workflow_list) + res_all_tests_dict["default"] = res_default_selected + if distributed_list: + distributed_workflow_list = [] + for item in distributed_list: + 
distributed_workflow_list.append(item) + res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) + res_all_tests_dict["distributed"] = res_distributed_selected + if inductor_list: + inductor_workflow_list = [] + for item in inductor_list: + inductor_workflow_list.append(item) + res_inductor_selected = run_selected_tests("inductor", test_run_test_path, overall_logs_path_current_run, test_reports_src, inductor_workflow_list) + res_all_tests_dict["inductor"] = res_inductor_selected + else: + raise Exception("Invalid test configurations!") + + # restore environment variables + os.environ.clear() + os.environ.update(_environ) + + # restore files + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(replace_text, search_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + return res_all_tests_dict + +def parse_args(): + parser = argparse.ArgumentParser(description='Run PyTorch unit tests and generate xml results summary', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--test_config', nargs='+', default=[], type=str, help="space-separated list of test workflows to be executed eg. 'default distributed'") + parser.add_argument('--priority_tests', action='store_true', help="run priority tests only") + parser.add_argument('--default_list', nargs='+', default=[], help="space-separated list of 'default' config test suites/files to be executed eg. 'test_weak test_dlpack'") + parser.add_argument('--distributed_list', nargs='+', default=[], help="space-separated list of 'distributed' config test suites/files to be executed eg. 'distributed/test_c10d_common distributed/test_c10d_nccl'") + parser.add_argument('--inductor_list', nargs='+', default=[], help="space-separated list of 'inductor' config test suites/files to be executed eg. 
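Assuming the script's directory is on `sys.path` (it is invoked as `run_pytorch_unit_tests.py`), the summarizer can also be driven programmatically. The sketch below uses a placeholder PyTorch checkout path and walks the returned dictionary, whose keys are `test_file_and_status` namedtuples as described in the `--example_output` help text:

```python
# Sketch only: programmatic use of the runner; the checkout path is a placeholder.
from run_pytorch_unit_tests import run_test_and_summarize_results

results = run_test_and_summarize_results(
    pytorch_root_dir="/path/to/pytorch",   # placeholder checkout
    priority_tests=True,
    test_config=["default"],
    default_list=[],
    distributed_list=[],
    inductor_list=[],
    skip_rerun=True,
)

for workflow, per_file in results.items():
    for key, counters in per_file.items():
        # STATISTICS entries hold the per-file counters built in summarize_xml_files()
        if key.status == "STATISTICS":
            print(workflow, key.file_name,
                  f'{counters.get("PASSED", 0)}/{counters.get("TOTAL", 0)} passed')
```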
'inductor/test_torchinductor test_ops'") + parser.add_argument('--pytorch_root', default='.', type=str, help="PyTorch root directory") + parser.add_argument('--skip_rerun', action='store_true', help="skip rerun process") + parser.add_argument('--example_output', type=str, help="{'workflow_name': {\n" + " test_file_and_status(file_name='workflow_aggregate', status='STATISTICS'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='ERROR'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='FAILED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='PASSED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n" + "}}\n") + parser.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n" + "RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n" + "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor") + return parser.parse_args() + +def check_num_gpus_for_distributed(): + p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True) + num_gpus_visible = int(p.stdout) + assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests" + +def main(): + args = parse_args() + all_tests_results = run_test_and_summarize_results(args.pytorch_root, args.priority_tests, args.test_config, args.default_list, args.distributed_list, args.inductor_list, args.skip_rerun) + pprint(dict(all_tests_results)) + +if __name__ == "__main__": + main() From 3c7ddbfc71f24138fe78c9484ec44447dc9f65f3 Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Fri, 11 Apr 2025 18:08:28 +0200 Subject: [PATCH 14/33] [release/2.6] enable NHWC batchnorm with MIOpen (#2023) This PR enables NHWC batchnorm on MIOpen in release/2.6 branch `ROCm version >= 6.5` and `PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM=1` environment variable required to enable nhwc batchnorm This PR branch for `release/2.6` was built and tested using docker image: `compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:15845_ubuntu22.04_py3.10_pytorch_rocm6.4_internal_testing_8190c80`. New batchnorm tests introduced: train: ``` test_batchnorm_train_NCHW_vs_cpu_float32 (__main__.TestNN) ... ok (0.040s) test_batchnorm_train_NCHW_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.007s) test_batchnorm_train_NCHW_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.005s) test_batchnorm_train_NCHW_vs_native_float32 (__main__.TestNN) ... ok (0.089s) test_batchnorm_train_NCHW_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_NCHW_float32 (__main__.TestNN) ... ok (0.020s) test_batchnorm_train_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... ok (0.006s) test_batchnorm_train_NHWC_vs_NCHW_mixed_float16 (__main__.TestNN) ... ok (0.006s) test_batchnorm_train_NHWC_vs_cpu_float32 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_native_float32 (__main__.TestNN) ... ok (0.004s) test_batchnorm_train_NHWC_vs_native_mixed_bfloat16 (__main__.TestNN) ... 
ok (0.004s) test_batchnorm_train_NHWC_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.004s) ``` inference: ``` test_batchnorm_inference_NCHW_vs_cpu_float32 (__main__.TestNN) ... ok (0.025s) test_batchnorm_inference_NCHW_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.005s) test_batchnorm_inference_NCHW_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NCHW_vs_native_float32 (__main__.TestNN) ... ok (0.102s) test_batchnorm_inference_NCHW_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.003s) test_batchnorm_inference_NHWC_vs_NCHW_float32 (__main__.TestNN) ... ok (0.018s) test_batchnorm_inference_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_NCHW_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_cpu_float32 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.004s) test_batchnorm_inference_NHWC_vs_native_float32 (__main__.TestNN) ... ok (0.003s) test_batchnorm_inference_NHWC_vs_native_mixed_bfloat16 (__main__.TestNN) ... ok (0.003s) test_batchnorm_inference_NHWC_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.003s) ``` --------- Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> (cherry picked from commit 45896ac60e0d80f9038ec7d4d655cfd1deae4463) (cherry picked from commit 7010d60047c85d679c4a3062b3146003be8d2bfe) --- aten/src/ATen/native/Normalization.cpp | 15 +- .../ATen/native/miopen/BatchNorm_miopen.cpp | 13 +- test/test_nn.py | 175 +++++++++++++++++- tools/autograd/derivatives.yaml | 2 +- 4 files changed, 195 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index ecad7d7f34197..3f473a7146453 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -520,6 +520,11 @@ BatchNormBackend _select_batch_norm_backend( return BatchNormBackend::Cudnn; } + // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen + // See #64427 + // non static variable is used to be able to change environment variable in runtime for testing + bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(false); + if ( detail::getCUDAHooks().compiledWithMIOpen() && cudnn_enabled @@ -527,6 +532,9 @@ BatchNormBackend _select_batch_norm_backend( && input.dim() <= MIOPEN_DIM_MAX && input.dim() >= 3 && input.scalar_type() != at::kDouble +#if (defined(USE_ROCM) && ROCM_VERSION < 60400) + && (input.scalar_type() != at::kBFloat16) +#endif && (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16) && weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input && weight.defined() && bias.defined() @@ -534,6 +542,11 @@ BatchNormBackend _select_batch_norm_backend( || (!running_mean.defined() && !running_var.defined() && training)) && input.suggest_memory_format() != MemoryFormat::ChannelsLast && input.suggest_memory_format() != MemoryFormat::ChannelsLast3d + && (input.suggest_memory_format() == MemoryFormat::Contiguous +#if (defined(USE_ROCM) && ROCM_VERSION >= 60500) + || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) +#endif + ) ) { return BatchNormBackend::Miopen; } @@ -613,7 +626,7 @@ std::tuple 
_batch_norm_impl_index( if (backend == BatchNormBackend::Miopen) { return std::tuple_cat( at::miopen_batch_norm( - input.contiguous(), weight.contiguous(), bias.contiguous(), + input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), running_mean.defined() ? running_mean.contiguous() : running_mean, running_var.defined() ? running_var.contiguous() : running_var, training, momentum, eps), diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index af69dfc76e571..f21325cd0848f 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -102,7 +102,7 @@ std::tuple miopen_batch_norm( mode = miopenBNSpatial; } - auto output_t = at::empty(input->sizes(), input->options()); + auto output_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format()); TensorArg output{ output_t, "output", 0 }; auto handle = getMiopenHandle(); @@ -179,8 +179,10 @@ std::tuple miopen_batch_norm_backward( const Tensor& save_var_t = save_var_t_opt.value_or(Tensor()); + auto grad_output_contig = + grad_output_t.contiguous(input_t.suggest_memory_format()); TensorArg input{ input_t, "input", 1 }, - grad_output{ grad_output_t, "grad_output", 2 }, + grad_output{ grad_output_contig, "grad_output", 2 }, weight{ weight_t, "weight", 3 }, save_mean{ save_mean_t, "save_mean", 4 }, save_var{ save_var_t, "save_var", 5 }; @@ -195,7 +197,9 @@ std::tuple miopen_batch_norm_backward( } checkAllSameType(c, {input, grad_output}); checkAllSameType(c, {weight, save_mean, save_var}); - checkAllContiguous(c, {input, grad_output, save_mean, save_var}); + checkAllContiguous(c, {save_mean, save_var}); + TORCH_CHECK(input->is_contiguous(input->suggest_memory_format())); + TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format())); checkDimRange(c, input, 2, 6 /* exclusive */); checkSameSize(c, input, grad_output); auto num_features = input->size(1); @@ -210,7 +214,8 @@ std::tuple miopen_batch_norm_backward( mode = miopenBNSpatial; } - auto grad_input_t = at::empty(input->sizes(), input->options()); + auto grad_input_t = at::empty( + input->sizes(), input->options(), input->suggest_memory_format()); auto grad_weight_t = at::empty(weight->sizes(), weight->options()); auto grad_bias_t = at::empty(weight->sizes(), weight->options()); diff --git a/test/test_nn.py b/test/test_nn.py index 2ece5fbdbd72c..3adc5a5e2e3a2 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -8,6 +8,7 @@ import io import itertools import warnings +import os import pickle import re from copy import deepcopy @@ -30,12 +31,12 @@ from torch.nn import Buffer, Parameter from torch.nn.parallel._functions import Broadcast from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes, floating_types -from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ +from torch.testing._internal.common_utils import dtype_name, freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \ download_file, get_function_arglist, load_tests, skipIfMPS, \ IS_PPC, \ parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \ - skipIfTorchDynamo, gcIfJetson, set_default_dtype + skipIfTorchDynamo, skipIfRocmVersionLessThan, gcIfJetson, set_default_dtype from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, 
PLATFORM_SUPPORTS_FLASH_ATTENTION from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \ module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \ @@ -5136,7 +5137,174 @@ def test_batchnorm_nhwc_cuda(self): inp2 = inp1.contiguous(memory_format=torch.channels_last) out1 = model(inp1) out2 = model(inp2) - self.assertTrue(torch.equal(out1, out2)) + self.assertEqual(out1, out2) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + @parametrize_test("mode", ["train", "inference"], name_fn=lambda x: x) + @parametrize_test( + # test verifies cudnn/miopen batchnorm with the reference backend or memory format + # memory_format - one of ("NCHW", NHWC") + # ref_backend - one of ("cpu", "native", "NCHW", "NHWC") + # "cpu" - cpu backend with the same memory_format will be used as reference + # "native" - native backend (`with torch.backends.cudnn.flags(enabled=False)`) + # with the same memory_format will be used + # "NCHW" or "NHWC" - the same backend will be used but another memory format + # mixed - True or False. Mixed batchnorm mode where inputs are 16-bit and batchnorm is fp32 + # + "memory_format,ref_backend,mixed,dtype", + [ + ("NCHW", "cpu", False, torch.float), + ("NCHW", "cpu", True, torch.half), + # NCHW bfloat16 path uses native kernels for rocm<=6.3 + # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 + subtest(("NCHW", "cpu", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]), + + ("NCHW", "native", False, torch.float), + ("NCHW", "native", True, torch.half), + # this config failed for train and passed for inference on ROCm6.4 + # subtest(("NCHW", "native", True, torch.bfloat16), decorators=[unittest.expectedFailure]), + + ("NHWC", "cpu", False, torch.float), + ("NHWC", "cpu", True, torch.half), + ("NHWC", "cpu", True, torch.bfloat16), + + ("NHWC", "native", False, torch.float), + ("NHWC", "native", True, torch.half), + ("NHWC", "native", True, torch.bfloat16), + + ("NHWC", "NCHW", False, torch.float), + ("NHWC", "NCHW", True, torch.half), + # NCHW bfloat16 path uses native kernels for rocm<=6.3 + # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 + subtest(("NHWC", "NCHW", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]), + ], + name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}" + ) + def test_batchnorm(self, mode, memory_format, ref_backend, mixed, dtype): + def _create_tensor(size, memory_format, dtype, device): + t = torch.empty(size=size, memory_format=memory_format, dtype=dtype, device=device) + t = t.random_(1, 10) + return t + + def _get_ref_device(backend: str , device: str): + # If 'backend' specifies the memory format, return 'device' arg, otherwise return a device matches the backend + if backend in ("NHWC", "NCHW"): + return device + if backend == "native": + return "cuda" + if backend == "cpu": + return "cpu" + else: + raise ValueError("Unknown backend") + + def _get_backend_memory_format(backend: str, memory_format: torch.memory_format) -> torch.memory_format: + # If 'backend' specifies the memory format, return it, otherwise look at 'memory_format' arg + if backend == "NHWC": + return torch.channels_last + if backend == "NCHW": + return torch.contiguous_format + if memory_format in (torch.contiguous_format, torch.channels_last): + return memory_format + raise ValueError("Unable to detect memory format for backend={backend} and memory_format={memory_format}") + + def _get_memory_format(t: 
torch.Tensor) -> torch.memory_format: + if t.is_contiguous(memory_format=torch.contiguous_format): + return torch.contiguous_format + if t.is_contiguous(memory_format=torch.channels_last): + return torch.channels_last + return ValueError("Unsupported memory_format") + + def _create_backend(inp: torch.Tensor, mixed: bool = False): + mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) + return mod + + def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend): + mod = _create_backend(inp, mixed).train() + mod.weight.data.uniform_() + mod.bias.data.uniform_() + + ref_mod = _create_backend(ref_inp, mixed).train() + ref_mod.load_state_dict(mod.state_dict()) + + out = mod(inp) + out.backward(grad) + + with torch.backends.cudnn.flags(enabled=False) if ref_backend == "native" else contextlib.nullcontext(): + ref_out = ref_mod(ref_inp) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=_get_memory_format(inp))) + self.assertTrue(ref_out.is_contiguous(memory_format=_get_memory_format(ref_inp))) + self.assertEqual(out, ref_out) + self.assertEqual(mod.weight.grad, ref_mod.weight.grad) + self.assertEqual(mod.bias.grad, ref_mod.bias.grad) + self.assertEqual(mod.running_mean, ref_mod.running_mean) + self.assertEqual(mod.running_var, ref_mod.running_var) + self.assertEqual(inp.grad, ref_inp.grad) + + def _train(memory_format, ref_backend, mixed, dtype): + memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last + ref_memory_format = _get_backend_memory_format(ref_backend, memory_format) + ref_device = _get_ref_device(ref_backend, device="cuda") + + size = (4, 8, 2, 2) + inp = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_() + grad = _create_tensor(size, memory_format, dtype, device="cuda") + ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_() + ref_grad = grad.detach().clone(memory_format=ref_memory_format).to(device=ref_device) + + _test_batchnorm_train(inp=inp, grad=grad, mixed=mixed, + ref_inp=ref_inp, ref_grad=ref_grad, ref_backend=ref_backend) + + # TODO: enable permute logic later + # size = (2, 8, 8, 1) + # input = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_() + # grad = _create_tensor(size, memory_format=torch.contiguous_format, dtype=dtype, device="cuda") + # # grad = _create_tensor(size, memory_format=memory_format, dtype=dtype, device="cuda") + + # ref_input = input.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_(True) + # ref_grad = grad.detach().clone(memory_format=torch.contiguous_format).to(device=ref_device) + # # ref_grad = grad.detach().clone(memory_format=ref_memory_format).to(device=ref_device) + + # if memory_format == torch.channels_last: + # grad = grad.permute(0, 2, 1, 3) + # # grad = grad.permute(0, 2, 3, 1) + # if ref_memory_format == torch.channels_last: + # ref_grad = ref_grad.permute(0, 2, 1, 3) + # # ef_grad = ref_grad.permute(0, 2, 3, 1) + # _test_batchnorm_train(input=input, grad=grad, mixed=mixed, + # ref_input=ref_input, ref_grad=ref_grad, ref_backend=ref_backend) + + def _inference(memory_format, ref_backend, mixed, dtype): + memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last + ref_memory_format = _get_backend_memory_format(ref_backend, memory_format) + ref_device = _get_ref_device(ref_backend, device="cuda") + + size = (2, 64, 50, 50) + inp = 
_create_tensor(size, memory_format, dtype, device="cuda") + ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device) + mod = _create_backend(inp, mixed).eval() + ref_mod = _create_backend(ref_inp, mixed).eval() + + out = mod(inp) + with torch.backends.cudnn.flags(enabled=False) if ref_backend == "native" else contextlib.nullcontext(): + ref_out = ref_mod(ref_inp) + self.assertEqual(out, ref_out) + + # TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen + PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = "PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM" + prev_val = os.getenv(PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) + try: + os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM] = "1" + if mode == "train": + _train(memory_format, ref_backend, mixed, dtype) + else: + _inference(memory_format, ref_backend, mixed, dtype) + finally: + if prev_val is None: + del os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM] + else: + os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM] = prev_val def test_batchnorm_load_state_dict(self): bn = torch.nn.BatchNorm2d(3) @@ -8377,7 +8545,6 @@ def test_affine_3d_rotateRandom(self, device): self.assertEqual(scipy_ary, gridsample_ary.reshape_as(scipy_ary)) - @onlyCUDA @dtypes(torch.float, torch.half) def test_batchnorm_large_batch(self, device, dtype): diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index e2419aab268b1..d711480bb85ef 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2793,7 +2793,7 @@ self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) - input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" + input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(input.suggest_memory_format()), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, epsilon) - name: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, float epsilon) -> (Tensor, Tensor, Tensor) From fb20451c34bab48f9c4a94a0bf102f9dfa13ff4b Mon Sep 17 00:00:00 2001 From: iupaikov-amd Date: Tue, 13 May 2025 18:52:32 +0200 Subject: [PATCH 15/33] test_decompose_mem_bound_mm.py tolerance increase for navi3x (cherry picked from commit 03c7da05f61890bbf5ae41e23c8df6d5f6805bac) --- test/inductor/test_decompose_mem_bound_mm.py | 45 ++++++++++++++++---- torch/testing/_internal/common_utils.py | 37 +++++++++++++++- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/test/inductor/test_decompose_mem_bound_mm.py b/test/inductor/test_decompose_mem_bound_mm.py index d21de3178cf1e..828d05738739f 100644 --- a/test/inductor/test_decompose_mem_bound_mm.py +++ b/test/inductor/test_decompose_mem_bound_mm.py @@ -12,6 +12,8 @@ from torch.testing import FileCheck from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, + patch_test_members, + is_navi3_arch, parametrize, TEST_XPU, ) @@ -61,31 +63,46 @@ def forward(self, input1, input2): ) @instantiate_parametrized_tests class TestDecomposeMemMM(TestCase): - def compare_dict_tensors(self, ref_dict, res_dict, rtol=1e-3, atol=1e-3): + def __init__(self, method_name='runTest', methodName='runTest'): + super().__init__(method_name, methodName) + self.atol = 1e-3 + self.rtol = 1e-3 + + def setup_tolerance(self, rtol=None, atol=None): + if rtol is None: + rtol = self.rtol + if atol is None: + atol = self.rtol + + def compare_dict_tensors(self, ref_dict, res_dict, rtol=None, atol=None): + self.setup_tolerance(rtol, atol) if len(set(ref_dict.keys())) != len(set(res_dict.keys())): return False for key1 in ref_dict.keys(): key2 = "_orig_mod." + key1 assert key2 in res_dict, f"{key1} does not exist in traced module" - if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=rtol, atol=atol): + if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=self.rtol, atol=self.atol): return False return True - def compare_pred(self, module, traced, input, rtol=1e-3, atol=1e-3): + def compare_pred(self, module, traced, input, rtol=None, atol=None): + self.setup_tolerance(rtol, atol) ref = module(*input) res = traced(*input) - self.assertEqual(ref, res, rtol=rtol, atol=atol) + self.assertEqual(ref, res, rtol=self.rtol, atol=self.atol) - def compare_parameters(self, module, traced, rtol=1e-3, atol=1e-3): + def compare_parameters(self, module, traced, rtol=None, atol=None): + self.setup_tolerance(rtol, atol) ref_params = dict(module.named_parameters()) res_params = dict(traced.named_parameters()) - self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol, atol)) + self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol=self.rtol, atol=self.atol)) - def compare_gradients(self, module, traced, rtol=1e-3, atol=1e-3): + def compare_gradients(self, module, traced, rtol=None, atol=None): + self.setup_tolerance(rtol, atol) ref_grad = {key: param.grad for key, param in module.named_parameters()} res_grad = {key: param.grad for key, param in traced.named_parameters()} self.assertTrue( - self.compare_dict_tensors(ref_grad, res_grad, rtol=rtol, atol=atol) + self.compare_dict_tensors(ref_grad, res_grad, rtol=self.rtol, atol=self.atol) ) @parametrize( @@ -192,6 +209,12 @@ def test_decompose_linear(self, m, n, k, has_bias, should_decompose): ) counters.clear() + # We have to increase tolerance for navi3 because all fp16, bf16 + # GEMMs operations have an accuracy issue caused by hardware limitation + @patch_test_members({ + "atol": 2e-3 if is_navi3_arch() else 1e-3, + 
"rtol": 2e-3 if is_navi3_arch() else 1e-3 + }) @parametrize( "m,k,n, should_decompose", [(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)], @@ -302,6 +325,12 @@ def test_decompose_mm_cpu(self, m, n, k, should_decompose): ) counters.clear() + # We have to increase tolerance for navi3 because all fp16, bf16 + # GEMMs operations have an accuracy issue caused by hardware limitation + @patch_test_members({ + "atol": 3e-3 if is_navi3_arch() else 1e-3, + "rtol": 4e-3 if is_navi3_arch() else 1e-3 + }) @parametrize( "m,k,n, should_decompose", [(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)], diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 45b7378f88cc8..e2bd47fe62000 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -102,8 +102,18 @@ has_pytest = False -MI300_ARCH = ("gfx942",) - +MI300_ARCH = ("gfx940", "gfx941", "gfx942") +NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201") +NAVI3_ARCH = ("gfx1100", "gfx1101") +NAVI4_ARCH = ("gfx1200", "gfx1201") + +def is_navi3_arch(): + if torch.cuda.is_available(): + prop = torch.cuda.get_device_properties(0) + gfx_arch = prop.gcnArchName.split(":")[0] + if gfx_arch in NAVI3_ARCH: + return True + return False def freeze_rng_state(*args, **kwargs): return torch.testing._utils.freeze_rng_state(*args, **kwargs) @@ -5710,3 +5720,26 @@ def load_inline(*args, **kwargs): return func(*args, load_inline=load_inline, **kwargs) return wrapper + +# Decorator to patch multiple test class members for the duration of the subtest +def patch_test_members(updates: Dict[str, Any]): + def decorator(test_func): + @wraps(test_func) + def wrapper(self, *args, **kwargs): + # Store the original values of the specified members + original_values = {member: getattr(self, member) for member in updates} + + # Update the members before running the subtest + for member, value in updates.items(): + setattr(self, member, value) + + # Run the test function, allowing subtests to run + try: + return test_func(self, *args, **kwargs) + finally: + # Restore the original values of the specified members after the subtest finishes + for member, original_value in original_values.items(): + setattr(self, member, original_value) + + return wrapper + return decorator \ No newline at end of file From 32449c9a7af659e157423ab834330142d870254d Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Thu, 22 May 2025 22:35:04 +0200 Subject: [PATCH 16/33] [release/2.7] enable NHWC batchnorm by default on ROCm7.0+ (#2180) NHWC batchnorm enabled by default if ROCm>=7.0 (cherry picked from commit e0afc3acaab703224e0faf9f54fa3ed39cb16e2b) --- aten/src/ATen/native/Normalization.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 3f473a7146453..6021f9d7edf1e 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -523,7 +523,8 @@ BatchNormBackend _select_batch_norm_backend( // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen // See #64427 // non static variable is used to be able to change environment variable in runtime for testing - bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(false); + // enabled by default for ROCm >= 7.0.0 + bool 
PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000); if ( detail::getCUDAHooks().compiledWithMIOpen() From 23f0b5f3b7870eb689b72f3de28eb10964c29bed Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Sat, 24 May 2025 04:51:06 +0200 Subject: [PATCH 17/33] [release/2.7] import 'Dict' to fix common_utils.py (#2181) Bug introduced by https://github.com/ROCm/pytorch/commit/03c7da05f61890bbf5ae41e23c8df6d5f6805bac (cherry picked from commit bbd0112a2cfbefbf687f5e907766aba4d030258e) --- torch/testing/_internal/common_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index e2bd47fe62000..052a968d51e22 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -49,6 +49,7 @@ from typing import ( Any, Callable, + Dict, Optional, TypeVar, Union, From 48630d8997007790924d82f117b91c43be6c4ab0 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Thu, 29 May 2025 00:33:36 -0500 Subject: [PATCH 18/33] [AUTOGENERATED] [release/2.7] [rocm6.4_internal_testing] Replaced ROCm specific skips to generalized conditions (#2126) Cherry-pick of https://github.com/ROCm/pytorch/pull/2100 Need to resolve conflicts --------- Co-authored-by: iupaikov-amd (cherry picked from commit f0c1ce8cd1232adc80fae55aa054626ad0d2b430) --- test/inductor/test_aot_inductor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 9f93a3959d831..6a768a3dbbb15 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -31,7 +31,11 @@ from torch.export.pt2_archive._package import load_pt2 from torch.testing import FileCheck from torch.testing._internal import common_utils -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8, SM80OrLater +from torch.testing._internal.common_cuda import ( + SM80OrLater, + SM90OrLater, + PLATFORM_SUPPORTS_FLASH_ATTENTION +) from torch.testing._internal.common_device_type import ( _has_sufficient_memory, skipCUDAIf, @@ -1363,6 +1367,7 @@ def forward(self, q, k, v): self.check_model(Model(), example_inputs) @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+") + @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") def test_sdpa_2(self): class Model(torch.nn.Module): def __init__(self) -> None: @@ -1615,6 +1620,7 @@ def forward(self, values, repeats, mask, embeddings, x, y, z, lst): self.check_model(Repro(), example_inputs, dynamic_shapes=spec) @skipIfXpu(msg="_scaled_dot_product_flash_attention is not supported on XPU yet") + @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") def test_fallback_kernel_with_symexpr_output(self): if self.device != GPU_TYPE: raise unittest.SkipTest("requires GPU") @@ -4173,6 +4179,7 @@ def grid(meta): dynamic_shapes=dynamic_shapes, ) + @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") def test_scaled_dot_product_efficient_attention(self): if self.device != GPU_TYPE: raise unittest.SkipTest("requires GPU") From ae17c3a98ffe389c67c3258408cdca277d77c753 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Tue, 3 Jun 2025 19:20:18 -0500 Subject: [PATCH 
19/33] [release/2.7] [SWDEV-535259] enable miopen channels last 3d for conv and batchnorm (#2232) Cherry-pick of https://github.com/ROCm/pytorch/pull/2209 Co-authored-by: Jeff Daily (cherry picked from commit bf0079dce7c9fb5acb4d1ab0b3219c6a415302e7) --- aten/src/ATen/native/ConvUtils.h | 16 ++++++++++------ aten/src/ATen/native/Convolution.cpp | 2 +- aten/src/ATen/native/Normalization.cpp | 1 + aten/src/ATen/native/miopen/Conv_miopen.cpp | 12 ++++++------ 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 6e99e9565240c..84381efe55b0b 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -362,20 +362,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten return false; } - bool can_use_miopen_channels_last_2d = false; // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen // See #64427 static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); + static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC; auto input_memory_format = input.suggest_memory_format(); auto weight_memory_format = weight.suggest_memory_format(); + auto weight_ndim = weight.ndimension(); - can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC && ( - ( (input_memory_format == at::MemoryFormat::ChannelsLast) || - (weight_memory_format == at::MemoryFormat::ChannelsLast) ) - ); + bool can_use_miopen_channels_last_2d = suggest_nhwc && (weight_ndim == 4) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast) + ); - bool can_use_miopen_channels_last_3d = false; + bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d) + ); return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index d06fc5168a0fd..1122d9c8d38af 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -1421,7 +1421,7 @@ static inline at::MemoryFormat determine_backend_memory_format( if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { TORCH_INTERNAL_ASSERT((k == 4 || k == 5), "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); - backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast; + backend_memory_format = (k == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } break; case ConvBackend::Mkldnn: diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 6021f9d7edf1e..b9cf3c907f09f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -546,6 +546,7 @@ BatchNormBackend _select_batch_norm_backend( && (input.suggest_memory_format() == MemoryFormat::Contiguous #if (defined(USE_ROCM) && ROCM_VERSION >= 60500) || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) + || (input.suggest_memory_format() == MemoryFormat::ChannelsLast3d && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) #endif ) ) { diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index d2cef0ca62188..f4e67e4fc307a 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -763,7 +763,7 @@ Tensor miopen_convolution_forward( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor output_t = at::detail::empty_cuda( @@ -872,7 +872,7 @@ Tensor miopen_depthwise_convolution_forward( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor output_t = at::detail::empty_cuda( @@ -1074,7 +1074,7 @@ Tensor miopen_depthwise_convolution_backward_weight( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor grad_output_contig_t = grad_output->contiguous(memory_format); @@ -1127,7 +1127,7 @@ Tensor miopen_convolution_backward_weight( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor grad_output_contig_t = grad_output->contiguous(memory_format); @@ -1281,7 +1281,7 @@ Tensor miopen_convolution_backward_input( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (weight->ndimension() == 5) ? 
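With this change, a 5-d weight in `torch.channels_last_3d` now selects the MIOpen NDHWC path instead of falling back to contiguous. A hedged sketch (not from the patch) of triggering it; note that `PYTORCH_MIOPEN_SUGGEST_NHWC` is cached in a static on first use, so it should be set before the first convolution or in the launching shell:

```python
import os
import torch
import torch.nn as nn

os.environ["PYTORCH_MIOPEN_SUGGEST_NHWC"] = "1"   # read once, on the first conv call

conv = nn.Conv3d(8, 16, kernel_size=3, padding=1).cuda().to(memory_format=torch.channels_last_3d)
x = torch.randn(2, 8, 4, 16, 16, device="cuda").to(memory_format=torch.channels_last_3d)
y = conv(x)
# NDHWC output is expected when the MIOpen channels-last-3d path is taken.
print(y.is_contiguous(memory_format=torch.channels_last_3d))
```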
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor grad_input_t = at::detail::empty_cuda( @@ -1389,7 +1389,7 @@ Tensor miopen_depthwise_convolution_backward_input( auto memory_format = at::MemoryFormat::Contiguous; if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } Tensor grad_input_t = at::detail::empty_cuda( From e4d62b137876a53c00324e967d23f91f3f9837ac Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Wed, 4 Jun 2025 18:33:11 -0500 Subject: [PATCH 20/33] [AUTOGENERATED] [release/2.7] Add 3D batchnorm tests (#2243) Cherry-pick of https://github.com/ROCm/pytorch/pull/2214 Co-authored-by: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Co-authored-by: Jeff Daily (cherry picked from commit 5631e0791910c7ea40e81027fa6de7b5e0e24253) --- test/test_nn.py | 75 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 20 deletions(-) diff --git a/test/test_nn.py b/test/test_nn.py index 3adc5a5e2e3a2..14d4eed971d2e 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -37,7 +37,7 @@ IS_PPC, \ parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \ skipIfTorchDynamo, skipIfRocmVersionLessThan, gcIfJetson, set_default_dtype -from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION +from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION, _get_torch_rocm_version from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \ module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \ ctcloss_reference, get_new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input @@ -5140,6 +5140,7 @@ def test_batchnorm_nhwc_cuda(self): self.assertEqual(out1, out2) @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + @parametrize_test("dims", [2, 3], name_fn=lambda x: f"{x}D") @parametrize_test("mode", ["train", "inference"], name_fn=lambda x: x) @parametrize_test( # test verifies cudnn/miopen batchnorm with the reference backend or memory format @@ -5155,14 +5156,11 @@ def test_batchnorm_nhwc_cuda(self): [ ("NCHW", "cpu", False, torch.float), ("NCHW", "cpu", True, torch.half), - # NCHW bfloat16 path uses native kernels for rocm<=6.3 - # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 - subtest(("NCHW", "cpu", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]), + ("NCHW", "cpu", True, torch.bfloat16), ("NCHW", "native", False, torch.float), ("NCHW", "native", True, torch.half), - # this config failed for train and passed for inference on ROCm6.4 - # subtest(("NCHW", "native", True, torch.bfloat16), decorators=[unittest.expectedFailure]), + ("NCHW", "native", True, torch.bfloat16), ("NHWC", "cpu", False, torch.float), ("NHWC", "cpu", True, torch.half), @@ -5174,13 +5172,33 @@ def test_batchnorm_nhwc_cuda(self): ("NHWC", "NCHW", False, torch.float), ("NHWC", "NCHW", True, torch.half), - # NCHW bfloat16 path uses native kernels for rocm<=6.3 - # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 - subtest(("NHWC", "NCHW", True, torch.bfloat16), 
decorators=[skipIfRocmVersionLessThan((6, 4))]), + ("NHWC", "NCHW", True, torch.bfloat16), ], name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}" ) - def test_batchnorm(self, mode, memory_format, ref_backend, mixed, dtype): + def test_batchnorm(self, dims, mode, memory_format, ref_backend, mixed, dtype): + if torch.version.hip: + if self._testMethodName in ("test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_bfloat16", + "test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16", + "test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16", + "test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16" + ) and _get_torch_rocm_version() < (6, 4): + # NCHW bfloat16 path uses native kernels for rocm<=6.3 + # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600 + self.skipTest("bfloat16 NHWC train failed on ROCm <= 6.3") + + if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_native_mixed_bfloat16", + "test_batchnorm_3D_train_NCHW_vs_native_mixed_bfloat16" + ) and _get_torch_rocm_version() >= (6, 4): + self.skipTest("bfloat16 NCHW train failed due to native tolerance issue SWDEV-507600") + + if self._testMethodName == "test_batchnorm_3D_train_NCHW_vs_native_mixed_float16" \ + and _get_torch_rocm_version() < (6, 4): + self.skipTest("3D float16 NCHW train failed on ROCm<=6.3 ") + + if dims == 3 and memory_format in ("NHWC", "NCHW"): + memory_format = memory_format + "3D" + def _create_tensor(size, memory_format, dtype, device): t = torch.empty(size=size, memory_format=memory_format, dtype=dtype, device=device) t = t.random_(1, 10) @@ -5188,7 +5206,7 @@ def _create_tensor(size, memory_format, dtype, device): def _get_ref_device(backend: str , device: str): # If 'backend' specifies the memory format, return 'device' arg, otherwise return a device matches the backend - if backend in ("NHWC", "NCHW"): + if backend in ("NHWC", "NHWC3D", "NCHW", "NCHW3D"): return device if backend == "native": return "cuda" @@ -5201,9 +5219,11 @@ def _get_backend_memory_format(backend: str, memory_format: torch.memory_format) # If 'backend' specifies the memory format, return it, otherwise look at 'memory_format' arg if backend == "NHWC": return torch.channels_last - if backend == "NCHW": + if backend == "NHWC3D": + return torch.channels_last_3d + if backend in ("NCHW", "NCHW3D"): return torch.contiguous_format - if memory_format in (torch.contiguous_format, torch.channels_last): + if memory_format in (torch.contiguous_format, torch.channels_last, torch.channels_last_3d): return memory_format raise ValueError("Unable to detect memory format for backend={backend} and memory_format={memory_format}") @@ -5212,10 +5232,24 @@ def _get_memory_format(t: torch.Tensor) -> torch.memory_format: return torch.contiguous_format if t.is_contiguous(memory_format=torch.channels_last): return torch.channels_last + if t.is_contiguous(memory_format=torch.channels_last_3d): + return torch.channels_last_3d + return ValueError("Unsupported memory_format") + + def _get_memory_format_from_name(memory_format_name: str) -> torch.memory_format: + if memory_format_name == "NHWC": + return torch.channels_last + elif memory_format_name == "NHWC3D": + return torch.channels_last_3d + elif memory_format_name in ("NCHW", "NCHW3D"): + return torch.contiguous_format return ValueError("Unsupported memory_format") def _create_backend(inp: torch.Tensor, mixed: bool = False): - mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) + + mod = nn.BatchNorm2d(inp.size(1), device=inp.device, 
dtype=torch.float if mixed else inp.dtype) \ + if inp.dim() == 4 else \ + nn.BatchNorm3d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) return mod def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend): @@ -5242,12 +5276,13 @@ def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend): self.assertEqual(mod.running_var, ref_mod.running_var) self.assertEqual(inp.grad, ref_inp.grad) - def _train(memory_format, ref_backend, mixed, dtype): - memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last + def _train(memory_format_name, ref_backend, mixed, dtype): + memory_format = _get_memory_format_from_name(memory_format_name) + ref_memory_format = _get_backend_memory_format(ref_backend, memory_format) ref_device = _get_ref_device(ref_backend, device="cuda") - size = (4, 8, 2, 2) + size = (4, 8, 2, 2, 2) if memory_format_name in ("NCHW3D", "NHWC3D") else (4, 8, 2, 2) inp = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_() grad = _create_tensor(size, memory_format, dtype, device="cuda") ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_() @@ -5275,12 +5310,12 @@ def _train(memory_format, ref_backend, mixed, dtype): # _test_batchnorm_train(input=input, grad=grad, mixed=mixed, # ref_input=ref_input, ref_grad=ref_grad, ref_backend=ref_backend) - def _inference(memory_format, ref_backend, mixed, dtype): - memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last + def _inference(memory_format_name, ref_backend, mixed, dtype): + memory_format = _get_memory_format_from_name(memory_format_name) ref_memory_format = _get_backend_memory_format(ref_backend, memory_format) ref_device = _get_ref_device(ref_backend, device="cuda") - size = (2, 64, 50, 50) + size = (2, 64, 50, 50, 50) if memory_format_name in ("NCHW3D", "NHWC3D") else (2, 64, 50, 50) inp = _create_tensor(size, memory_format, dtype, device="cuda") ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device) mod = _create_backend(inp, mixed).eval() From d40f3c8babe65790a438c55bf3bba909c09b186a Mon Sep 17 00:00:00 2001 From: rocm-mici <49319081+rocm-mici@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:35:55 -0600 Subject: [PATCH 21/33] [AUTOGENERATED] [release/2.5] [ROCm][layer_norm] Use __builtin_amdgcn_rcpf(x) instead of 1.f/x (#1800) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cherry-pick of https://github.com/ROCm/pytorch/pull/1688 Co-authored-by: Michael Halkenhäuser Co-authored-by: Hashem Hashemi (cherry picked from commit f8544afa6d17de9cc63b6ff28b4b32cf0c1a4381) (cherry picked from commit ed487541c625699693d7ba025593bb41f555968b) (cherry picked from commit d62a39eb77f5d7c922e93f3292ef7e072cdf6f84) --- aten/src/ATen/native/cuda/layer_norm_kernel.cu | 8 ++++++++ cmake/Dependencies.cmake | 16 ++++++++++++++++ setup.py | 4 ++++ 3 files changed, 28 insertions(+) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 0709164e60555..aa25ad5f6ee0c 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -131,7 +131,11 @@ WelfordDataLN cuWelfordOnlineSum( { U delta = val - curr_sum.mean; U new_count = curr_sum.count + 1.f; +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + U new_mean = curr_sum.mean + delta * 
__builtin_amdgcn_rcpf(new_count); +#else U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster +#endif return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; } @@ -145,7 +149,11 @@ WelfordDataLN cuWelfordCombine( U count = dataA.count + dataB.count; U mean, sigma2; if (count > decltype(dataB.count){0}) { +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + auto coef = __builtin_amdgcn_rcpf(count); +#else auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division +#endif auto nA = dataA.count * coef; auto nB = dataB.count * coef; mean = nA*dataA.mean + nB*dataB.mean; diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8d8..1a06f2915787d 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1048,6 +1048,22 @@ if(USE_ROCM) list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) endif(CMAKE_BUILD_TYPE MATCHES Debug) + # Get EnVar 'PYTORCH_LAYERNORM_FAST_RECIPROCAL' (or default to on). + if(DEFINED ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL}) + set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE $ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL}) + else() + set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE ON) + endif() + + set(PYTORCH_LAYERNORM_FAST_RECIPROCAL + ${PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE} + CACHE BOOL "Enable fast reciprocals within layer normalization." FORCE + ) + + if(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + add_definitions(-DPYTORCH_LAYERNORM_FAST_RECIPROCAL) + endif() + # needed for compat with newer versions of hip-clang that introduced C++20 mangling rules list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17) diff --git a/setup.py b/setup.py index b4ebc92f59268..4db59ecd0b087 100644 --- a/setup.py +++ b/setup.py @@ -153,6 +153,10 @@ # USE_ROCM_KERNEL_ASSERT=1 # Enable kernel assert in ROCm platform # +# PYTORCH_LAYERNORM_FAST_RECIPROCAL +# If set, enables the use of builtin functions for fast reciprocals (1/x) w.r.t. +# layer normalization. Default: enabled. +# # Environment variables we respect (these environment variables are # conventional and are often understood/set by other software.) 
# From dbb9f2a1d56edf6f738ee3afd61a6673b7d4ae1a Mon Sep 17 00:00:00 2001 From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> Date: Wed, 30 Apr 2025 19:11:29 +0200 Subject: [PATCH 22/33] [release/2.6] remove xfail from 'batch_norm_with_update' (#2070) remove `xfail` from `batch_norm_with_update` op in `test_grad` and `test_vmap_autograd_grad` these tests are passed since ROCm6.4 Fixes https://ontrack-internal.amd.com/browse/SWDEV-529820 (cherry picked from commit 99b075842070e2b73625440d8712b87e1833515c) (cherry picked from commit a7044a4005eeb6d60cdc77a23e629a63b977eb1a) (cherry picked from commit 3fc00a85a28c854ffc4832551d720e839c48700c) --- test/functorch/test_ops.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py index cef00f83eb72d..244d8518c6a10 100644 --- a/test/functorch/test_ops.py +++ b/test/functorch/test_ops.py @@ -436,13 +436,6 @@ class TestOperators(TestCase): ), # Works on ROCm xfail("torch.ops.aten._flash_attention_forward"), xfail("torch.ops.aten._efficient_attention_forward"), - # RuntimeError: Expected contiguous tensor, but got - # non-contiguous tensor for argument #2 'grad_output' - decorate( - "_batch_norm_with_update", - decorator=expectedFailureIf(TEST_WITH_ROCM), - device_type="cuda", - ), } ), ) @@ -2368,13 +2361,6 @@ def fn(input, weight, bias): skip("sparse.sampled_addmm", ""), skip("sparse.mm", "reduce"), skip("native_layer_norm", "", device_type="cpu"), - # RuntimeError: Expected contiguous tensor, but got - # non-contiguous tensor for argument #2 'grad_output' - decorate( - "_batch_norm_with_update", - decorator=expectedFailureIf(TEST_WITH_ROCM), - device_type="cuda", - ), }, ) @opsToleranceOverride( From e62e3947b3b80e90f3e70261d045809de5444e08 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Wed, 4 Jun 2025 09:40:52 -0700 Subject: [PATCH 23/33] [release/2.7] Enable mx fp8 support on ROCm (#2199) Ported mx fp8 part from https://github.com/ROCm/pytorch/pull/2046 Current test stats (accounting only blockwise scale tests) PYTORCH_TEST_WITH_ROCM=1 python test/test_matmul_cuda.py -k test_blockwise -v Ran 225 tests in 8.256s FAILED (failures=1, skipped=150) _74 test pass_ **fp8 mx data type sample test case.** test_blockwise_mxfp8_numerics_test_case_name_data_random_scales_one_fast_accum_True_512_128_256_cuda (__main__.TestFP8MatmulCudaCUDA) hipblaslt-bench --api_method c -m 256 -n 512 -k 128 --lda 128 --ldb 128 --ldc 256 --ldd 256 --stride_a 0 --stride_b 0 --stride_c 0 --stride_d 0 --alpha 1 --beta 0 --transA T --transB N --batch_count 1 --scaleA 3 --scaleB 3 --a_type f8_r --b_type f8_r --c_type bf16_r --d_type bf16_r --compute_type f32_r --algo_method index --solution_index -2146957310 --rotating 0 --cold_iters 0 --iters 0 --------- Signed-off-by: Jagadish Krishnamoorthy (cherry picked from commit d17e2227ecfdf81b3684aa034113a747928ea6cb) --- aten/src/ATen/cuda/CUDABlas.cpp | 14 +++++++-- aten/src/ATen/native/cuda/Blas.cpp | 36 ++++++++++++++++++++-- test/test_matmul_cuda.py | 8 +++-- torch/testing/_internal/common_cuda.py | 10 +++++- torch/utils/hipify/cuda_to_hip_mappings.py | 5 +++ 5 files changed, 65 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 89350a11bea7e..d884fd7ffa915 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -1879,6 +1879,16 @@ void scaled_gemm( matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; matmulDescB = 
HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; } + else if(mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { +#if ROCM_VERSION >= 70000 + if (at::detail::getCUDAHooks().isGPUArch(0, {"gfx950"})) { + // Validate matrix dimensions for MX format + TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), + "Matrix dimensions must be multiples of 32 for MX format. ", + "Got m=", m, ", n=", n, ", k=", k); + } +#endif + } #else // rowwise isn't supported using older hipblaslt TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with older hipblaslt"); @@ -1917,11 +1927,11 @@ void scaled_gemm( } if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { -#if CUDA_VERSION >= 12080 +#if (!defined(USE_ROCM) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000) computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); #else - TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above"); + TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 7.0(with gfx950) and above"); #endif // if CUDA_VERSION >= 12080 } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) { #if CUDA_VERSION >= 12080 diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 1834839bb6e85..21e6f9f65dd70 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -1133,12 +1133,15 @@ ScalingType get_scaling_type( auto expected_b_size = BLOCK_SIZE_MN * ceil_div(dim_n, BLOCK_SIZE_MN) * padded_num_k_blocks; + //TODO: enable the checks for ROCm +#ifndef USE_ROCM TORCH_CHECK(scale_a.numel() == expected_a_size, "For BlockWise scaling: Expected scale_a size to be ", expected_a_size, " but got ", scale_a.numel()); TORCH_CHECK(scale_b.numel() == expected_b_size, "For BlockWise scaling: Expected scale_b size to be ", expected_b_size, " but got ", scale_b.numel()); +#endif TORCH_CHECK( scale_a.is_contiguous() && scale_b.is_contiguous(), @@ -1205,6 +1208,7 @@ ScalingType get_scaling_type( } // namespace + // Computes matrix multiply + bias while applying scaling to input and output matrices // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default. // If output matrix type is 16 or 32-bit type, scale_result is not applied. 
@@ -1268,6 +1272,14 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, // Type restrictions imposed by CuBLASLt as of CUDA-12.1 TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2, "Multiplication of two Float8_e5m2 matrices is not supported"); +#endif +#ifdef USE_ROCM + if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e5m2 is only supported for ROCm 6.0 and above"); + } + if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e4m3fn is only supported for ROCm 6.0 and above"); + } #endif if (use_fast_accum) { TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype."); @@ -1327,7 +1339,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, } #else if (scaling_choice == ScalingType::RowWise) { - // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes. + // For ROCm, match behavior of f8f8bf16_rowwise type checking Tensor b = mat2; if (_scaled_mm_is_fnuz()) { TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz); @@ -1335,9 +1347,25 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, else { TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn); } - // Until more than bf16 is supported. + // Until more than bf16 is supported TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16, - "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type()); + "hipblaslt rowwise _scaled_mm only supports BFloat16 output"); + } + else if (scaling_choice == ScalingType::BlockWise) { +#if ROCM_VERSION >= 70000 + TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}, 0), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && + mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif } #endif @@ -1416,10 +1444,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, params.k = args.k; params.a = args.mata->data_ptr(); params.a_scale_ptr = args.scale_mata_ptr; + params.a_scale_dtype = scale_a.scalar_type(); params.lda = args.lda; params.a_dtype = args.mata->scalar_type(); params.b = args.matb->data_ptr(); params.b_scale_ptr = args.scale_matb_ptr; + params.b_scale_dtype = scale_b.scalar_type(); params.ldb = args.ldb; params.b_dtype = args.matb->scalar_type(); params.bias_ptr = bias ? 
bias->data_ptr(): nullptr; diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index e8b4d9092cdd0..8ec832e40a163 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -1453,6 +1453,10 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r device = "cuda" M, K, N = mkn + if torch.version.hip: + if not (M % 32 == 0 and K % 32 == 0 and N % 32 == 0): + raise unittest.SkipTest("Matrix dimensions must be multiples of 32 on ROCm, skipping") + if recipe == "nvfp4" and K % 32 != 0: return unittest.skip("K must be divisible by 32 for nvfp4 cublas gemm, skipping") @@ -1462,7 +1466,7 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r if test_case_name == "a_eye_b_eye": if not ((M == K) and (M == N)): - return unittest.skip("this test is only defined for M == K == N, skipping") + raise unittest.SkipTest("this test is only defined for M == K == N, skipping") A_ref = torch.eye(M, device=device, dtype=torch.bfloat16) B_ref = torch.eye(M, device=device, dtype=torch.bfloat16) @@ -1601,7 +1605,7 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r elif test_case_name == "data_random_scales_from_data": if not K % BLOCK_SIZE == 0: - return unittest.skip(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping") + raise unittest.SkipTest(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping") require_exact_match = False # random data, scales from data A_ref = torch.randn((M, K), device=device, dtype=torch.bfloat16) * 1000 diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index a211851d671fa..2620c64a95ef1 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -108,7 +108,15 @@ def evaluate_platform_supports_fp8(): PLATFORM_SUPPORTS_FP8: bool = LazyVal(lambda: evaluate_platform_supports_fp8()) -PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: TEST_CUDA and SM100OrLater) +def _platform_supports_mx_gemm(): + if torch.cuda.is_available(): + if torch.version.hip: + return 'gfx95' in torch.cuda.get_device_properties(0).gcnArchName + else: + return SM100OrLater + return False + +PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: _platform_supports_mx_gemm()) if TEST_NUMBA: try: diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index a5145a2f4870a..b251a85e245a5 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -3870,6 +3870,7 @@ ("CUDA_C_64U", ("HIP_C_64U", CONV_TYPE, API_RUNTIME)), ("CUDA_R_8F_E4M3", ("HIP_R_8F_E4M3", CONV_TYPE, API_RUNTIME)), ("CUDA_R_8F_E5M2", ("HIP_R_8F_E5M2", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_4F_E2M1", ("HIP_R_4F_E2M1", CONV_TYPE, API_RUNTIME)), ( "MAJOR_VERSION", ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), @@ -7347,6 +7348,10 @@ ("CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F", ("HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F", CONV_MATH_FUNC, API_BLAS)), ("CUBLASLT_MATMUL_DESC_AMAX_D_POINTER", ("HIPBLASLT_MATMUL_DESC_AMAX_D_POINTER", CONV_MATH_FUNC, API_BLAS)), ("CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", ("HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_DESC_A_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_A_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_DESC_B_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_B_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", 
("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", CONV_MATH_FUNC, API_BLAS)), + ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", CONV_MATH_FUNC, API_BLAS)), ("cublasLtMatrixLayout_t", ("hipblasLtMatrixLayout_t", CONV_MATH_FUNC, API_BLAS)), ("cublasLtMatrixLayoutOpaque_t", ("hipblasLtMatrixLayoutOpaque_t", CONV_MATH_FUNC, API_BLAS)), ("cublasLtMatrixLayoutAttribute_t", ("hipblasLtMatrixLayoutAttribute_t", CONV_MATH_FUNC, API_BLAS)), From e0160f1b99ba589db23d421697a4b861038c485b Mon Sep 17 00:00:00 2001 From: Andres Lugo <108368282+alugorey@users.noreply.github.com> Date: Tue, 1 Apr 2025 22:03:37 -0500 Subject: [PATCH 24/33] Extend CK gemm/sdpa support to gfx950 (#45) Update CK for gfx950 (#49) (cherry picked from commit 8ccfc47420df0a17b79e351ef240e743ebfd9a77) (cherry picked from commit b5d5987eb6d4eaf6577e7a9f80d53fa9a3f79177) --- aten/src/ATen/Context.cpp | 2 +- .../src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index fd346b2d9af00..fefcf731cd825 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -419,7 +419,7 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { if(b == at::ROCmFABackend::Ck) { static const bool ck_unsupported = []() { static const std::vector archs = { - "gfx90a", "gfx942" + "gfx90a", "gfx942", "gfx950" }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { if (!detail::getCUDAHooks().isGPUArch(archs, index)) { diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp index 38ec2ef20c5cc..affa40619b598 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp @@ -453,4 +453,5 @@ struct fmha_bwd_traits bool is_deterministic; // TODO: padding check is inside this api }; +template float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&); From 08390c7ce36e76624a0a1c6f503313c783056f9c Mon Sep 17 00:00:00 2001 From: pmaybank <113125070+pmaybank@users.noreply.github.com> Date: Thu, 29 May 2025 16:41:38 +0100 Subject: [PATCH 25/33] [release/2.6] [SWDEV-529824] Fix Float16 CooperativeReduction Test Failure (#2204) - Previously expected values were calculated on GPU using same dtype as result values - Now expected values are calculated on CPU using Float32 dtype - This fixes a test failure that was observed on Navi48 where difference between Eager mode (expected) and Inductor / Triton (result) did not meet the error tolerance when sum was evaluated on an array of Float16 values Co-authored-by: pnikolic-amd (cherry picked from commit 8fe3cdd5a0a8529b07ccb0c4735b013aaa2828ca) (cherry picked from commit 34f3b3e005f1b16011b1bc9642ad0ca054dae619) --- test/inductor/test_cooperative_reductions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py index 469ceec2e1b2b..a913ffb25bf33 100644 --- a/test/inductor/test_cooperative_reductions.py +++ b/test/inductor/test_cooperative_reductions.py @@ -58,7 +58,8 @@ def setUp(self): torch._dynamo.reset() def run_and_check(self, fn, args, *, expect_kernel_count=1): - expected = fn(*args) + args_cpu = [tensor.cpu().to(torch.float32) for tensor in args] + expected = fn(*args_cpu).to(torch.float16) fn = torch.compile(fn, 
fullgraph=True) result, (source_code,) = run_and_get_code(fn, *args) self.assertEqual(result, expected) From 01857c6576812b96148fae55655f034c60941112 Mon Sep 17 00:00:00 2001 From: Jerry Mannil <65309407+jerrymannil@users.noreply.github.com> Date: Tue, 10 Jun 2025 10:55:55 -0700 Subject: [PATCH 26/33] [ROCm] Set thread_work_size to 16 for vectorized elementwise kernels (#2259) * thread_work_size of 16 is giving better perf with many workloads (cherry picked from commit 7edf50cab4e34f2e0acdb35abf40c7f3967a3426) --- aten/src/ATen/native/cuda/CUDALoops.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 9b104a7966363..f96b8d687bc86 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -226,8 +226,9 @@ C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { using traits = function_traits; constexpr auto io_size = calc_io_size(); -#ifdef __gfx942__ - constexpr int tws = (io_size >= 2) ? 8 : 16; +#if defined(USE_ROCM) && defined(__gfx942__) + // Similar check in launch_vectorized_kernel() as well. Both should be in sync. + constexpr int tws = 16; #else constexpr int tws = elems_per_thread(); #endif @@ -296,7 +297,7 @@ static inline void launch_vectorized_kernel( int vec_size = memory::can_vectorize_up_to(data); c10::DeviceIndex curDevice = -1; AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice)); - int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? ((io_size >= 2) ? 8 : 16) : elems_per_thread(); + int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? 16 : elems_per_thread(); #else using cpp_type = typename function_traits::result_type; const uint16_t max_vec_size = memory::can_vectorize_up_to(data); From e60c0c46add9d4d12c6e375a2c7874eb634c39c8 Mon Sep 17 00:00:00 2001 From: Sampsa Riikonen Date: Thu, 19 Jun 2025 17:05:56 +0300 Subject: [PATCH 27/33] [release/2.7] Fix SDPA skip logic (#2281) fixes https://ontrack-internal.amd.com/browse/SWDEV-522391 for PT 2.7 (cherry picked from commit df38cca03ece6d06485f5cebc26db909fd258246) --- test/inductor/test_aot_inductor.py | 20 +++++++++++++++----- test/inductor/test_torchinductor.py | 3 +++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 6a768a3dbbb15..32a36653b225a 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -32,9 +32,10 @@ from torch.testing import FileCheck from torch.testing._internal import common_utils from torch.testing._internal.common_cuda import ( + PLATFORM_SUPPORTS_FLASH_ATTENTION, + PLATFORM_SUPPORTS_FP8, + PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, SM80OrLater, - SM90OrLater, - PLATFORM_SUPPORTS_FLASH_ATTENTION ) from torch.testing._internal.common_device_type import ( _has_sufficient_memory, @@ -1367,7 +1368,12 @@ def forward(self, q, k, v): self.check_model(Model(), example_inputs) @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") + @unittest.skipIf( + # for archs where this isn't lowered to flash attention, the math + # backend will be used and it doesn't work for bfloat16 + not PLATFORM_SUPPORTS_FLASH_ATTENTION, + "Some archs don't support SDPA with bfloat16", + ) def test_sdpa_2(self): class Model(torch.nn.Module): def __init__(self) -> None: 
@@ -1620,7 +1626,9 @@ def forward(self, values, repeats, mask, embeddings, x, y, z, lst): self.check_model(Repro(), example_inputs, dynamic_shapes=spec) @skipIfXpu(msg="_scaled_dot_product_flash_attention is not supported on XPU yet") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support flash SDPA" + ) def test_fallback_kernel_with_symexpr_output(self): if self.device != GPU_TYPE: raise unittest.SkipTest("requires GPU") @@ -4179,7 +4187,9 @@ def grid(meta): dynamic_shapes=dynamic_shapes, ) - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA") + @unittest.skipIf( + not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Some archs don't support mem eff SDPA" + ) def test_scaled_dot_product_efficient_attention(self): if self.device != GPU_TYPE: raise unittest.SkipTest("requires GPU") diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 20ce486f6faab..ee18c3eb508f0 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -11538,6 +11538,9 @@ def fn(q, k, v): @xfail_if_mps_unimplemented @expectedFailureXPU + @unittest.skipIf( + not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Some archs don't support mem eff SDPA" + ) def test_scaled_dot_product_efficient_attention(self): if self.device == "cpu": raise unittest.SkipTest(f"requires {GPU_TYPE}") From 01eaee8f31327832cc4da7d44f1cf0ac2acb41ce Mon Sep 17 00:00:00 2001 From: Ethan Wee Date: Tue, 1 Jul 2025 13:27:44 -0700 Subject: [PATCH 28/33] [release/2.7] Update test_binary_ufuncs.py after numpy upgrade (#2289) Related to https://github.com/ROCm/pytorch/commit/c7a1e32fbcf9e0a458d959a453de65c27c51452c Fixes https://ontrack-internal.amd.com/browse/SWDEV-537835 Not a Navi specific failure: ``` File "/opt/conda/envs/py_3.12/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 1412, in only_fn return fn(slf, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/lib/jenkins/pytorch/test/test_binary_ufuncs.py", line 1671, in test_cuda_tensor_pow_scalar_tensor self._test_pow(base, exp) File "/var/lib/jenkins/pytorch/test/test_binary_ufuncs.py", line 1482, in _test_pow self.assertEqual(actual, expected) File "/opt/conda/envs/py_3.12/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 4052, in assertEqual raise error_metas.pop()[0].to_error( AssertionError: The values for attribute 'dtype' do not match: torch.float32 != torch.float64. ``` Using .to(actual) without specifying dtype/device assumes actual is a tensor or tensor-like, which may fail silently or promote. Fixed by explicitly matching dtype and device. Going from https://github.com/pytorch/pytorch/issues/107302 Fix: ``` root@ubb4-rack-22:/var/lib/jenkins/pytorch# TEST_CONFIG=default HIP_VISIBLE_DEVICES=0 PYTORCH_TEST_WITH_ROCM=1 python test/test_binary_ufuncs.py TestBinaryUfuncsCUDA.test_cuda_tensor_pow_scalar_tensor_cuda /opt/conda/envs/py_3.12/lib/python3.12/site-packages/hypothesis/entry_points.py:23: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources Running tests... ---------------------------------------------------------------------- . 
---------------------------------------------------------------------- Ran 1 test in 0.141s OK Generating XML reports... root@ubb4-rack-22:/var/lib/jenkins/pytorch# pip list | grep numpy numpy 2.1.2 ``` (cherry picked from commit a4d60fa319fc275d138abba367528d0a45243ea7) --- test/test_binary_ufuncs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index bdc0d7329df59..b3f81f1332863 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -1447,7 +1447,7 @@ def to_np(value): try: np_res = np.power(to_np(base), to_np(np_exponent)) expected = ( - torch.from_numpy(np_res) + torch.from_numpy(np_res).to(dtype=base.dtype) if isinstance(np_res, np.ndarray) else torch.tensor(np_res, dtype=base.dtype) ) From 80e89740c17525e4065a2160f084664d1caca3f8 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Tue, 8 Jul 2025 01:07:52 -0500 Subject: [PATCH 29/33] [AUTOGENERATED] [release/2.7] fix jit_utils.cpp (#2320) Cherry-pick of https://github.com/ROCm/pytorch/pull/2319 Co-authored-by: Jeff Daily (cherry picked from commit e725e2e7c51be0a0b022e4404bfc69777e2ef010) --- aten/src/ATen/native/cuda/jit_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 19d0447369914..cd1441a14b1e9 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -45,7 +45,7 @@ namespace at::cuda::jit { // Copied from aten/src/ATen/cuda/llvm_basic.cpp, then modified as above. // If not compiling for ROCm, return the original get_traits_string(). std::string get_traits_string_but_hiprtc_safe() { -#ifdef USE_ROCM +#if defined(USE_ROCM) && ROCM_VERSION < 70000 return R"ESCAPE( namespace std { From bb44c0cc737ba8aa4978f57ff2763a729af1cd6a Mon Sep 17 00:00:00 2001 From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com> Date: Mon, 14 Jul 2025 12:23:45 -0500 Subject: [PATCH 30/33] Clean up CUDA state between tests (#2335) This PR fixes the unit test, test/test_cuda.py::TestCuda::test_set_per_process_memory_fraction FAILED [0.1163s] ``` Traceback (most recent call last): File "/var/lib/jenkins/pytorch/test/test_cuda.py", line 471, in test_set_per_process_memory_fraction tmp_tensor = torch.empty(application, dtype=torch.int8, device="cuda") RuntimeError: Trying to create tensor with negative dimension -5681285432: [-5681285432] ``` This error occurs only on gfx1101 arch. This error is coming from an integer overflow when another unit test, test/test_cuda.py::TestCuda::test_randint_generation_for_large_numel creates a tensor with a huge numel, which overflows into a higher torch.cuda.max_memory_reserved() when you call test/test_cuda.py::TestCuda::test_set_per_process_memory_fraction afterward. To avoid this we introduced torch.cuda.empty_cache() and torch.cuda.reset_peak_memory_stats() to clean up CUDA states. 
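For reference, a minimal sketch of the cleanup pattern described above (the `_reset_cuda_memory_state` helper name is illustrative only; the actual change simply inlines the two calls at the top of the affected test, as the diff below shows):

```python
import torch

def _reset_cuda_memory_state() -> None:
    # Release cached allocator blocks left over from earlier tests and clear
    # the peak-memory counters, so that max_memory_reserved() only reflects
    # allocations made by the current test.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
```

Running such a reset at the start of a memory-accounting test keeps its result independent of whatever tests executed before it.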
JIRA: https://ontrack-internal.amd.com/browse/SWDEV-535295 (cherry picked from commit f86d18439897232a374504c36b40da99c14ade1a) --- test/test_cuda.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_cuda.py b/test/test_cuda.py index 5e5adcf56a60e..35e834d3b62ee 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -467,6 +467,9 @@ def test_out_of_memory_retry(self): IS_JETSON, "oom reporting has issues on jetson igx due to partial nvml support" ) def test_set_per_process_memory_fraction(self): + if torch.version.hip and ('gfx1101' in torch.cuda.get_device_properties(0).gcnArchName): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() orig = torch.cuda.get_per_process_memory_fraction(0) torch.cuda.reset_peak_memory_stats(0) try: From 1f312c4ff01cfebbf8da9d623e8183183d266c09 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Sat, 28 Jun 2025 22:38:43 +0000 Subject: [PATCH 31/33] cublaslt/hipblaslt persistent workspace (#156495) Similar to cublas/hipblas, LT now allocates one workspace per handle+stream combo. - fixes hipblaslt issue where memory use increased during graph capture - preserves CUDA env var TORCH_CUBLASLT_UNIFIED_WORKSPACE - moves LT workspace and size from CUDABlas.cpp into CublasHandlePool.cpp, new APIs - size_t getCUDABlasLtWorkspaceSize() - void* getCUDABlasLtWorkspace() Fixes https://github.com/ROCm/pytorch/issues/2286. Pull Request resolved: https://github.com/pytorch/pytorch/pull/156495 Approved by: https://github.com/eqy (cherry picked from commit 996206e66fcafff25d0af5177497e8f792000869) --- aten/src/ATen/cuda/CUDABlas.cpp | 85 ++------------------ aten/src/ATen/cuda/CUDAContextLight.h | 3 + aten/src/ATen/cuda/CublasHandlePool.cpp | 91 ++++++++++++++++++++++ aten/src/ATen/cuda/tunable/GemmHipblaslt.h | 32 +------- 4 files changed, 102 insertions(+), 109 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index d884fd7ffa915..573389bcd7c70 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -188,82 +188,11 @@ uint32_t _getAlignment(uintptr_t address) { } #endif -static size_t _parseChosenWorkspaceSize() { - auto val = c10::utils::get_env("CUBLASLT_WORKSPACE_SIZE"); -#ifdef USE_ROCM - if (!val.has_value()) { - // accept either env var - val = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE"); - } - size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */ -#else - size_t workspace_size = 1024; /* default size in KiB according to #73328 */ -#endif - - if (val.has_value()) { - try { - workspace_size = std::stoi(val.value()); - } catch (std::invalid_argument const&) { - TORCH_WARN( - "invalid CUBLASLT_WORKSPACE_SIZE,", - " using default workspace size of ", - workspace_size, - " KiB."); - } catch (std::out_of_range const&) { - TORCH_WARN( - "CUBLASLT_WORKSPACE_SIZE out of range,", - " using default workspace size of ", - workspace_size, - " KiB."); - } - } - return workspace_size * 1024; -} - -static size_t _getWorkspaceSize() { - static size_t workspace_size = _parseChosenWorkspaceSize(); - return workspace_size; -} - -void* _getUnifiedWorkspaceWithoutHandle() { - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - auto stream = c10::cuda::getCurrentCUDAStream(); - cudaStream_t _stream = stream; - auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key); - TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end()); - return 
workspace_it->second.mutable_get(); -} - struct CublasLtWorkspace { CublasLtWorkspace() { - size = _getWorkspaceSize(); -#ifndef USE_ROCM - static bool unified = c10::utils::check_env("TORCH_CUBLASLT_UNIFIED_WORKSPACE") == true; - if (unified) { - auto cublasWorkspaceSize = at::cuda::getChosenWorkspaceSize(); - if (cublasWorkspaceSize < size) { - TORCH_WARN_ONCE("Requested unified CUBLASLT workspace size of ", size, - " bytes exceeds CUBLAS workspace size of ", cublasWorkspaceSize, - " bytes. Please increase CUBLAS workspace size", - " via CUBLAS_WORKSPACE_CONFIG or decrease requested" - " CUBLASLT_WORKSPACE_SIZE. Otherwise CUBLASLT workspace" - " size will be limited to the CUBLAS workspace size."); - size = cublasWorkspaceSize; - } - ptr = _getUnifiedWorkspaceWithoutHandle(); - } else { - auto allocator = c10::cuda::CUDACachingAllocator::get(); - stashed_ptr_ = allocator->allocate(size); - ptr = stashed_ptr_.mutable_get(); - } -#else - auto allocator = c10::cuda::CUDACachingAllocator::get(); - stashed_ptr_ = allocator->allocate(size); - ptr = stashed_ptr_.mutable_get(); -#endif + size = at::cuda::getCUDABlasLtWorkspaceSize(); + ptr = at::cuda::getCUDABlasLtWorkspace(); } - at::DataPtr stashed_ptr_; void * ptr; size_t size; }; @@ -2111,10 +2040,8 @@ void int8_gemm( #ifdef USE_ROCM CuBlasLtMatmulPreference preference; - size_t workspaceSize = _getWorkspaceSize(); - preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize); - auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); - auto workspace = allocator.allocate(workspaceSize); + auto ltworkspace = CublasLtWorkspace(); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( @@ -2152,12 +2079,12 @@ void int8_gemm( nullptr, // Heuristics don't seem to work for int8 #endif #ifdef USE_ROCM - workspace.mutable_get(), + ltworkspace.ptr, #else nullptr, // Non-zero workspace doesn't seem to work. #endif #ifdef USE_ROCM - workspaceSize, + ltworkspace.size, #else 0, #endif diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h index 65019bb6097c9..86e960cc1ab4a 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h @@ -89,7 +89,10 @@ TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); TORCH_CUDA_CPP_API void clearCublasWorkspaces(); TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublas_handle_stream_to_workspace(); +TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace(); TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize(); +TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize(); +TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace(); #if defined(CUDART_VERSION) || defined(USE_ROCM) TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle(); diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 720304ad198e8..81159d5c0e272 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -23,6 +23,9 @@ * To work around this difference in behavior, a separate handle pool is available for ROCm builds. * For CUDA builds, getCurrentCUDABlasLtHandle will alias for getCurrentCUDABlasHandle, * whereas for ROCm builds, it is a distinct function. + * + * The workspace pools are separate for ROCm. 
On CUDA, the env var + * TORCH_CUBLASLT_UNIFIED_WORKSPACE can be used to opt-in to unifying the workspace pools. */ namespace at::cuda { @@ -109,8 +112,14 @@ std::map, at::DataPtr>& cublas_handle_stream_to_works return instance; } +std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace() { + static auto& instance = *new std::map, at::DataPtr>; + return instance; +} + void clearCublasWorkspaces() { cublas_handle_stream_to_workspace().clear(); + cublaslt_handle_stream_to_workspace().clear(); } size_t parseChosenWorkspaceSize() { @@ -157,15 +166,97 @@ size_t parseChosenWorkspaceSize() { } } +size_t parseCUDABlasLtWorkspaceSize() { + auto val = c10::utils::get_env("CUBLASLT_WORKSPACE_SIZE"); +#ifdef USE_ROCM + if (!val.has_value()) { + // accept either env var + val = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE"); + } + size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */ +#else + size_t workspace_size = 1024; /* default size in KiB according to #73328 */ +#endif + + if (val.has_value()) { + try { + workspace_size = std::stoi(val.value()); + } catch (std::invalid_argument const&) { + TORCH_WARN( + "invalid CUBLASLT_WORKSPACE_SIZE,", + " using default workspace size of ", + workspace_size, + " KiB."); + } catch (std::out_of_range const&) { + TORCH_WARN( + "CUBLASLT_WORKSPACE_SIZE out of range,", + " using default workspace size of ", + workspace_size, + " KiB."); + } + } + return workspace_size * 1024; +} + size_t getChosenWorkspaceSize() { size_t pool_size = parseChosenWorkspaceSize(); return pool_size; } +#define TORCH_CUBLASLT_UNIFIED_WORKSPACE "TORCH_CUBLASLT_UNIFIED_WORKSPACE" + +size_t getCUDABlasLtWorkspaceSize() { + size_t pool_size = parseCUDABlasLtWorkspaceSize(); +#ifndef USE_ROCM + static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true; + if (unified) { + auto cublasWorkspaceSize = getChosenWorkspaceSize(); + if (cublasWorkspaceSize < pool_size) { + TORCH_WARN_ONCE("Requested unified CUBLASLT workspace size of ", pool_size, + " bytes exceeds CUBLAS workspace size of ", cublasWorkspaceSize, + " bytes. Please increase CUBLAS workspace size", + " via CUBLAS_WORKSPACE_CONFIG or decrease requested" + " CUBLASLT_WORKSPACE_SIZE. 
Otherwise CUBLASLT workspace" + " size will be limited to the CUBLAS workspace size."); + pool_size = cublasWorkspaceSize; + } + } +#endif + return pool_size; +} + at::DataPtr getNewWorkspace() { return c10::cuda::CUDACachingAllocator::get()->allocate(getChosenWorkspaceSize()); } +at::DataPtr getNewCUDABlasLtWorkspace() { + return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize()); +} + +void* getCUDABlasLtWorkspace() { +#ifndef USE_ROCM + static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true; + if (unified) { + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + auto stream = c10::cuda::getCurrentCUDAStream(); + cudaStream_t _stream = stream; + auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); + auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key); + TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end()); + return workspace_it->second.mutable_get(); + } +#endif + cublasLtHandle_t handle = getCurrentCUDABlasLtHandle(); + auto stream = c10::cuda::getCurrentCUDAStream(); + cudaStream_t _stream = stream; + auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); + auto workspace_it = cublaslt_handle_stream_to_workspace().find(key); + if (workspace_it == cublaslt_handle_stream_to_workspace().end()) { + workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()}); + } + return workspace_it->second.mutable_get(); +} + cublasHandle_t getCurrentCUDABlasHandle() { c10::DeviceIndex device = 0; AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index a23a2d720c5c4..fe6d1161d1ba9 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -381,28 +381,6 @@ static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) { return HIPBLAS_OP_T; } -static size_t GetHipblasltWorkspaceSize() { - static const auto env = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE"); - // 256MB is max workspace size allowed for hipblaslt - // hipblaslt-bench uses 32MB - // recommendation from hipblaslt author was 76MB - // TunableOp hipBLASLt workspace size is aligned with - // PyTorch's default in CUDABlas.cpp (_parseChosenWorkspaceSize) - size_t workspace_size = 76*1024; - if (env) { - try { - workspace_size = std::stoi(env.value()); - } catch(std::invalid_argument const& e) { - TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,", - " using default workspace size of ", workspace_size, " KiB."); - } catch(std::out_of_range const& e) { - TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,", - " using default workspace size of ", workspace_size, " KiB."); - } - } - return workspace_size * 1024; -} - template struct HipBlasLtDeleter { void operator()(T* x) { @@ -550,7 +528,7 @@ class HipblasltGemmOp : public Callable { } } - size_t workspace_size = GetHipblasltWorkspaceSize(); + size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize(); auto op_handle = at::cuda::getCurrentCUDABlasLtHandle(); @@ -575,10 +553,7 @@ class HipblasltGemmOp : public Callable { return FAIL; } - void* workspace_buffer = nullptr; - if (workspace_size > 0) { - workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size); - } + void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace(); TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle, matmul.descriptor(), @@ -601,9 +576,6 @@ class 
HipblasltGemmOp : public Callable { TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a)); TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b)); TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c)); - if (workspace_size > 0) { - c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer); - } return OK; } From 3b7f377541a9aea6548faecfd2e22cb00c78d0c0 Mon Sep 17 00:00:00 2001 From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com> Date: Tue, 15 Jul 2025 11:54:34 -0500 Subject: [PATCH 32/33] [AUTOGENERATED] [release/2.7] [release/2.6] Fix dtype before comparing torch and numpy tensors (#2362) Cherry-pick of https://github.com/ROCm/pytorch/pull/2340 Co-authored-by: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com> (cherry picked from commit 22c98ea3175f5520a7724ada85a6d6ed482b4785) --- test/test_binary_ufuncs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index b3f81f1332863..7772134fd1534 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -1447,7 +1447,7 @@ def to_np(value): try: np_res = np.power(to_np(base), to_np(np_exponent)) expected = ( - torch.from_numpy(np_res).to(dtype=base.dtype) + torch.from_numpy(np_res) if isinstance(np_res, np.ndarray) else torch.tensor(np_res, dtype=base.dtype) ) @@ -1480,8 +1480,8 @@ def to_np(value): self.assertRaisesRegex(RuntimeError, regex, base.pow_, exponent) elif torch.can_cast(torch.result_type(base, exponent), base.dtype): actual2 = actual.pow_(exponent) - self.assertEqual(actual, expected) - self.assertEqual(actual2, expected) + self.assertEqual(actual, expected.to(actual)) + self.assertEqual(actual2, expected.to(actual)) else: self.assertRaisesRegex( RuntimeError, From 8b2361443564d9d9753124fe9e39e3d8d187e114 Mon Sep 17 00:00:00 2001 From: tvukovic-amd Date: Fri, 27 Jun 2025 07:23:28 +0000 Subject: [PATCH 33/33] [ROCm][Windows] Fixing undefined symbol linker error after exposing MIOpen symbols (#156479) Fixing undefined symbol linker error after [exposing MIOpen symbols](https://github.com/pytorch/pytorch/pull/154545). This fix: - Hipifies `aten/src/ATen/miopen` and `aten/src/ATen/native/miopen` files - Adds `aten/src/ATen/miopen` and `aten/src/ATen/native/miopen` hipified source files to `all_hip_cpp` list Pull Request resolved: https://github.com/pytorch/pytorch/pull/156479 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily --- aten/src/ATen/miopen/Descriptors.h | 12 ++++++------ aten/src/ATen/miopen/Handle.h | 2 +- aten/src/ATen/miopen/Types.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index a0ad4a4e1098a..2eee837cd533d 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -39,7 +39,7 @@ struct DescriptorDeleter { // function. template // NOLINTNEXTLINE(bugprone-exception-escape) -class TORCH_CUDA_CPP_API Descriptor { +class TORCH_HIP_CPP_API Descriptor { public: // Use desc() to access the underlying descriptor pointer in // a read-only fashion. Most client code should use this. 
@@ -65,7 +65,7 @@ class TORCH_CUDA_CPP_API Descriptor { std::unique_ptr> desc_; }; -class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor< +class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor> { @@ -88,7 +88,7 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor< std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); -class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor< +class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor> { @@ -105,7 +105,7 @@ class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor< } }; -struct TORCH_CUDA_CPP_API ConvolutionDescriptor +struct TORCH_HIP_CPP_API ConvolutionDescriptor : public Descriptor< miopenConvolutionDescriptor, &miopenCreateConvolutionDescriptor, @@ -121,7 +121,7 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor }; // NOLINTNEXTLINE(bugprone-exception-escape) -struct TORCH_CUDA_CPP_API DropoutDescriptor +struct TORCH_HIP_CPP_API DropoutDescriptor : public Descriptor< miopenDropoutDescriptor, &miopenCreateDropoutDescriptor, @@ -137,7 +137,7 @@ struct TORCH_CUDA_CPP_API DropoutDescriptor } }; -struct TORCH_CUDA_CPP_API RNNDescriptor +struct TORCH_HIP_CPP_API RNNDescriptor : public Descriptor diff --git a/aten/src/ATen/miopen/Handle.h b/aten/src/ATen/miopen/Handle.h index 4c80c3aea65bf..b1637fca0a582 100644 --- a/aten/src/ATen/miopen/Handle.h +++ b/aten/src/ATen/miopen/Handle.h @@ -5,5 +5,5 @@ namespace at::native { -TORCH_CUDA_CPP_API miopenHandle_t getMiopenHandle(); +TORCH_HIP_CPP_API miopenHandle_t getMiopenHandle(); } // namespace at::native diff --git a/aten/src/ATen/miopen/Types.h b/aten/src/ATen/miopen/Types.h index 0a8a1a952e2e2..fdc0f6a607b71 100644 --- a/aten/src/ATen/miopen/Types.h +++ b/aten/src/ATen/miopen/Types.h @@ -6,7 +6,7 @@ namespace at::native { -TORCH_CUDA_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); +TORCH_HIP_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); int64_t miopen_version();