From 9d2baa71c900d73cba5d2b91a05f99c11cd3e003 Mon Sep 17 00:00:00 2001
From: BL <110066325+BLOrange-AMD@users.noreply.github.com>
Date: Tue, 22 Jul 2025 16:15:01 -0500
Subject: [PATCH 1/4] Updated fbgemm.txt

---
NOTE(review): this series was recovered from a copy in which line breaks and
indentation had been collapsed. Headers, hashes, hunk counts and every token
are unchanged. Leading whitespace (2-space indent assumed) and blank context
lines are reconstructed to match the hunk line counts -- TODO confirm against
the target tree, or apply with `git apply --ignore-whitespace`.

 .github/ci_commit_pins/fbgemm.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/fbgemm.txt b/.github/ci_commit_pins/fbgemm.txt
index 169c1a360d13..ed08064bc9d5 100644
--- a/.github/ci_commit_pins/fbgemm.txt
+++ b/.github/ci_commit_pins/fbgemm.txt
@@ -1 +1 @@
-de731af65b4f04696e85c729e3282450b51b95fd
+3d641a1ca1e58bf068306b987d3b371b9deddf77

From 068036aef1170c4ece037b1f2dc7831bdf98f833 Mon Sep 17 00:00:00 2001
From: BL <110066325+BLOrange-AMD@users.noreply.github.com>
Date: Tue, 22 Jul 2025 16:40:36 -0500
Subject: [PATCH 2/4] Set the same commit with upstream fix

---
 .github/ci_commit_pins/fbgemm_rocm.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/fbgemm_rocm.txt b/.github/ci_commit_pins/fbgemm_rocm.txt
index fa11e10ca6b8..db140a31f3fa 100644
--- a/.github/ci_commit_pins/fbgemm_rocm.txt
+++ b/.github/ci_commit_pins/fbgemm_rocm.txt
@@ -1 +1 @@
-5fb5024118e9bb9decf96c2b0b1a8f0010bf56be
+7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8

From d50c190f7dc2cdb3e4efff5f0b0c2e0c337115d8 Mon Sep 17 00:00:00 2001
From: BL <110066325+BLOrange-AMD@users.noreply.github.com>
Date: Tue, 22 Jul 2025 16:48:50 -0500
Subject: [PATCH 3/4] Reverted fbgemm.txt changes

---
NOTE(review): this hunk restores exactly the value that PATCH 1/4 replaced
(blob 169c1a360d13), so patches 1 and 3 together leave fbgemm.txt unchanged.
Both can be dropped if the series is squashed.

 .github/ci_commit_pins/fbgemm.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/fbgemm.txt b/.github/ci_commit_pins/fbgemm.txt
index ed08064bc9d5..169c1a360d13 100644
--- a/.github/ci_commit_pins/fbgemm.txt
+++ b/.github/ci_commit_pins/fbgemm.txt
@@ -1 +1 @@
-3d641a1ca1e58bf068306b987d3b371b9deddf77
+de731af65b4f04696e85c729e3282450b51b95fd

From e401aa77e04f8f2a820319975691ef6eb89fd56d Mon Sep 17 00:00:00 2001
From: BL <110066325+BLOrange-AMD@users.noreply.github.com>
Date: Thu, 31 Jul 2025 23:48:08 -0500
Subject: [PATCH 4/4] Build domain libraries on the build job

---
NOTE(review): what this patch does
  * Adds pip_build_and_install <build_target> <wheel_dir>: runs `pip wheel`
    for the target only when <wheel_dir> holds no *.whl yet, then hands every
    *.whl in <wheel_dir> to pip_install_whl.
  * ROCm branch of install_torchrec_and_fbgemm: reads the ROCm version from
    rocm_version.h, exports BUILD_ROCM_VERSION as "<major>.<minor>", and
    replaces `setup.py install` with `setup.py bdist_wheel`, keeping the
    wheel under dist/fbgemm_gpu (relative to /tmp) before installing it.
  * Non-ROCm branch: torchrec and FBGEMM now go through
    pip_build_and_install; the CUDA_PATH=/usr/local/cuda-12.1 prefix and the
    `egg=fbgemm-gpu` URL fragment are dropped.

NOTE(review): points to confirm before merging
  * pip_build_and_install reuses any wheel already present in <wheel_dir>;
    nothing ties it to <build_target>. Presumably the directory is fresh per
    job -- confirm, otherwise a wheel built from an older pin is installed.
  * Both install loops pass each glob result straight to pip_install_whl. If
    the build produced no wheel, the unexpanded pattern "<dir>/*.whl" is what
    gets passed (unless nullglob is enabled elsewhere -- not visible here).
  * The loop variable `file` is not declared `local` in
    pip_build_and_install, so it stays set in the calling shell.
  * ROCM_HOME is computed with fallbacks (ROCM_PATH, then /opt/rocm), but
    -DHIP_ROOT_DIR still receives "${ROCM_PATH}", which expands to an empty
    string when ROCM_PATH is unset. Confirm which variable is intended.
  * wheel_dir is set to dist/fbgemm_gpu, yet mkdir/cp repeat the literal
    path. That path is resolved after `pushd /tmp`, whereas dist/torchrec is
    resolved in the caller's working directory. Confirm this is intended.
  * The comment "Set ROCM_HOME isn't available, ..." presumably means
    "If ROCM_HOME is not set, use ROCM_PATH if set, else /opt/rocm".

 .ci/pytorch/common_utils.sh | 102 +++++++++++++++++++++++++++++++-----
 1 file changed, 89 insertions(+), 13 deletions(-)

diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh
index 4f8439bd832d..8f2c45aa4ffa 100644
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@@ -78,6 +78,34 @@ function pip_install_whl() {
   fi
 }
 
+function pip_build_and_install() {
+  local build_target=$1
+  local wheel_dir=$2
+
+  local found_whl=0
+  for file in "${wheel_dir}"/*.whl
+  do
+    if [[ -f "${file}" ]]; then
+      found_whl=1
+      break
+    fi
+  done
+
+  # Build the wheel if it doesn't exist
+  if [ "${found_whl}" == "0" ]; then
+    python3 -m pip wheel \
+      --no-build-isolation \
+      --no-deps \
+      --no-use-pep517 \
+      -w "${wheel_dir}" \
+      "${build_target}"
+  fi
+
+  for file in "${wheel_dir}"/*.whl
+  do
+    pip_install_whl "${file}"
+  done
+}
 
 function pip_install() {
   # retry 3 times
@@ -174,25 +202,73 @@ function install_torchrec_and_fbgemm() {
 
   if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then
     # install torchrec first because it installs fbgemm nightly on top of rocm fbgemm
-    pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
+    pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
     pip_uninstall fbgemm-gpu-nightly
 
+    # Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm
+    ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}"
+
+    # Find rocm_version.h header file for ROCm version extract
+    rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h"
+    if [ ! -f "$rocm_version_h" ]; then
+      rocm_version_h="${ROCM_HOME}/include/rocm_version.h"
+    fi
+
+    # Error out if rocm_version.h not found
+    if [ ! -f "$rocm_version_h" ]; then
+      echo "Error: rocm_version.h not found in expected locations." >&2
+      exit 1
+    fi
+
+    # Extract major, minor and patch ROCm version numbers
+    MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}')
+    MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}')
+    PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}')
+    ROCM_INT=$((MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_VERSION))
+    echo "ROCm version: $ROCM_INT"
+    export BUILD_ROCM_VERSION="$MAJOR_VERSION.$MINOR_VERSION"
+
     pip_install tabulate  # needed for newer fbgemm
     pip_install patchelf  # needed for rocm fbgemm
-    git clone --recursive https://github.com/pytorch/fbgemm
-    pushd fbgemm/fbgemm_gpu
-    git checkout "${fbgemm_commit}"
-    python setup.py install \
-      --package_variant=rocm \
-      -DHIP_ROOT_DIR="${ROCM_PATH}" \
-      -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-      -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
-    popd
+    pushd /tmp
+
+    local wheel_dir=dist/fbgemm_gpu
+    local found_whl=0
+    for file in "${wheel_dir}"/*.whl
+    do
+      if [[ -f "${file}" ]]; then
+        found_whl=1
+        break
+      fi
+    done
+
+    # Build the wheel if it doesn't exist
+    if [ "${found_whl}" == "0" ]; then
+      git clone --recursive https://github.com/pytorch/fbgemm
+      pushd fbgemm/fbgemm_gpu
+      git checkout "${fbgemm_commit}"
+      python setup.py bdist_wheel \
+        --build-variant=rocm \
+        -DHIP_ROOT_DIR="${ROCM_PATH}" \
+        -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
+        -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
+      popd
+
+      # Save the wheel before cleaning up
+      mkdir -p dist/fbgemm_gpu
+      cp fbgemm/fbgemm_gpu/dist/*.whl dist/fbgemm_gpu
+    fi
+
+    for file in "${wheel_dir}"/*.whl
+    do
+      pip_install_whl "${file}"
+    done
+
     rm -rf fbgemm
+    popd
   else
-    # See https://github.com/pytorch/pytorch/issues/106971
-    CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
-    pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
+    pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
+    pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
   fi
 }
 