From 9dd6db1ce6aded757b3074a1d3f01cb765e824f4 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Fri, 19 Sep 2025 17:11:45 +0000 Subject: [PATCH 1/4] update gcc --- scripts/build_torch_wheels.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/build_torch_wheels.sh b/scripts/build_torch_wheels.sh index 0f30a1e4e62..e773b5c0449 100755 --- a/scripts/build_torch_wheels.sh +++ b/scripts/build_torch_wheels.sh @@ -119,9 +119,9 @@ function install_llvm_clang() { function install_gcc() { sudo apt-get -y install gcc-11 g++-11 - export CC=/usr/bin/gcc-10 export CXX=/usr/bin/g++-11 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100 + export CC=/usr/bin/gcc-11 export CXX=/usr/bin/g++-11 + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 } function install_req_packages() { From 84bfe5a419836f6fdcb39fb964440d4bab368c34 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Sun, 21 Sep 2025 01:49:12 +0000 Subject: [PATCH 2/4] Use download-artifact@v5 --- .github/workflows/_build_torch_xla.yml | 153 ++++++----- .github/workflows/_docs.yml | 134 +++++----- .github/workflows/_test.yml | 342 ++++++++++++------------- .github/workflows/_torchprime_ci.yml | 222 ++++++++-------- .github/workflows/setup/action.yml | 98 +++---- 5 files changed, 474 insertions(+), 475 deletions(-) diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml index 34b0df460c8..aaff9c2bf9d 100644 --- a/.github/workflows/_build_torch_xla.yml +++ b/.github/workflows/_build_torch_xla.yml @@ -1,80 +1,79 @@ name: build-torch-xla on: - workflow_call: - inputs: - dev-image: - required: true - type: string - description: Base image for builds - torch-commit: - required: true - type: string - description: torch-commit - runner: - required: false - type: string - description: Runner type for the test - default: linux.12xlarge - timeout-minutes: - required: false - type: number - description: Timeout in minutes for the build job - default: 45 # Takes ~20m as of 2025/5/30. - has_code_changes: - required: false - type: string - description: Whether to run full workflow or not - default: 'true' - secrets: - gcloud-service-key: - required: true - description: Secret to access Bazel build cache + workflow_call: + inputs: + dev-image: + required: true + type: string + description: Base image for builds + torch-commit: + required: true + type: string + description: torch-commit + runner: + required: false + type: string + description: Runner type for the test + default: linux.12xlarge + timeout-minutes: + required: false + type: number + description: Timeout in minutes for the build job + default: 45 # Takes ~20m as of 2025/5/30. + has_code_changes: + required: false + type: string + description: Whether to run full workflow or not + default: "true" + secrets: + gcloud-service-key: + required: true + description: Secret to access Bazel build cache jobs: - build: - runs-on: ${{ inputs.runner }} - timeout-minutes: ${{ inputs.timeout-minutes }} - container: - image: ${{ inputs.dev-image }} - env: - GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }} - BAZEL_REMOTE_CACHE: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - BAZEL_JOBS: '' # Let bazel decide the parallelism based on the number of CPUs. 
- BUILD_CPP_TESTS: 1 - steps: - # Need to check out local composite actions before using them - # https://github.com/orgs/community/discussions/11771 - - name: Checkout actions - if: inputs.has_code_changes == 'true' - uses: actions/checkout@v4 - with: - sparse-checkout: | - .github/workflows/setup - path: .actions - - name: Setup - if: inputs.has_code_changes == 'true' - uses: ./.actions/.github/workflows/setup - with: - torch-commit: ${{ inputs.torch-commit }} - - name: Build - if: inputs.has_code_changes == 'true' - shell: bash - run: | - cd pytorch/xla/infra/ansible - ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps - - name: Upload wheel - if: inputs.has_code_changes == 'true' - uses: actions/upload-artifact@v4 - with: - name: torch-xla-wheels - path: /dist/*.whl - - name: Upload CPP test binaries - if: inputs.has_code_changes == 'true' - uses: actions/upload-artifact@v4 - with: - name: cpp-test-bin - path: /tmp/test/bin - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." - + build: + runs-on: ${{ inputs.runner }} + timeout-minutes: ${{ inputs.timeout-minutes }} + container: + image: ${{ inputs.dev-image }} + env: + GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }} + BAZEL_REMOTE_CACHE: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + BAZEL_JOBS: "" # Let bazel decide the parallelism based on the number of CPUs. + BUILD_CPP_TESTS: 1 + steps: + # Need to check out local composite actions before using them + # https://github.com/orgs/community/discussions/11771 + - name: Checkout actions + if: inputs.has_code_changes == 'true' + uses: actions/checkout@v4 + with: + sparse-checkout: | + .github/workflows/setup + path: .actions + - name: Setup + if: inputs.has_code_changes == 'true' + uses: ./.actions/.github/workflows/setup + with: + torch-commit: ${{ inputs.torch-commit }} + - name: Build + if: inputs.has_code_changes == 'true' + shell: bash + run: | + cd pytorch/xla/infra/ansible + ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps + - name: Upload wheel + if: inputs.has_code_changes == 'true' + uses: actions/upload-artifact@v4 + with: + name: torch-xla-wheels + path: /dist/*.whl + - name: Upload CPP test binaries + if: inputs.has_code_changes == 'true' + uses: actions/upload-artifact@v4 + with: + name: cpp-test-bin + path: /tmp/test/bin + - name: Report no code changes + if: inputs.has_code_changes == 'false' + run: | + echo "No code changes were detected that require running the full test suite." 
diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index ee2650834e8..f35e0adbca6 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -1,70 +1,70 @@ name: xla-docs-build on: - workflow_call: - inputs: - dev-image: - required: true - type: string - description: Base image for builds - runner: - required: false - type: string - description: Runner type for the test - default: linux.4xlarge - secrets: - torchxla-bot-token: - required: true + workflow_call: + inputs: + dev-image: + required: true + type: string + description: Base image for builds + runner: + required: false + type: string + description: Runner type for the test + default: linux.4xlarge + secrets: + torchxla-bot-token: + required: true jobs: - build-docs: - runs-on: ubuntu-24.04 - timeout-minutes: 45 - container: - image: ${{ inputs.dev-image }} - env: - BRANCH_NAME: ${{ github.ref_name }} - steps: - - name: Fetch wheels - uses: actions/download-artifact@v4 - with: - name: torch-xla-wheels - path: /tmp/wheels/ - - name: Install wheels - shell: bash - run: | - pip install /tmp/wheels/*.whl - - name: Checkout PyTorch/XLA Repo - uses: actions/checkout@v4 - with: - path: pytorch/xla - - name: Build docs - shell: bash - run: | - cd pytorch/xla/docs - pip install -r requirements.txt - sphinx-build -b html source build - - name: Checkout GitHub Pages - uses: actions/checkout@v4 - with: - path: gh-pages - ref: gh-pages - token: ${{ github.event_name == 'push' && secrets.torchxla-bot-token || github.token }} - - name: Merge changes - shell: bash - run: | - subdir=${{ env.BRANCH_NAME == 'master' && 'master' || format('{0}/{1}', 'release', env.BRANCH_NAME) }} - mkdir -p gh-pages/$subdir - cp -fR pytorch/xla/docs/build/* gh-pages/$subdir - - name: Upload preview as artifact - uses: actions/upload-artifact@v4 - with: - name: github-pages - path: pytorch/xla/docs/build/ - - name: Deploy - shell: bash - run: | - cd gh-pages - git config user.email "pytorchxla@gmail.com" - git config user.name "torchxlabot2" - git add . -v - git diff --cached --exit-code || git commit -m "Update doc from commit ${{ github.sha }}" - git push origin gh-pages + build-docs: + runs-on: ubuntu-24.04 + timeout-minutes: 45 + container: + image: ${{ inputs.dev-image }} + env: + BRANCH_NAME: ${{ github.ref_name }} + steps: + - name: Fetch wheels + uses: actions/download-artifact@v5 + with: + name: torch-xla-wheels + path: /tmp/wheels/ + - name: Install wheels + shell: bash + run: | + pip install /tmp/wheels/*.whl + - name: Checkout PyTorch/XLA Repo + uses: actions/checkout@v4 + with: + path: pytorch/xla + - name: Build docs + shell: bash + run: | + cd pytorch/xla/docs + pip install -r requirements.txt + sphinx-build -b html source build + - name: Checkout GitHub Pages + uses: actions/checkout@v4 + with: + path: gh-pages + ref: gh-pages + token: ${{ github.event_name == 'push' && secrets.torchxla-bot-token || github.token }} + - name: Merge changes + shell: bash + run: | + subdir=${{ env.BRANCH_NAME == 'master' && 'master' || format('{0}/{1}', 'release', env.BRANCH_NAME) }} + mkdir -p gh-pages/$subdir + cp -fR pytorch/xla/docs/build/* gh-pages/$subdir + - name: Upload preview as artifact + uses: actions/upload-artifact@v4 + with: + name: github-pages + path: pytorch/xla/docs/build/ + - name: Deploy + shell: bash + run: | + cd gh-pages + git config user.email "pytorchxla@gmail.com" + git config user.name "torchxlabot2" + git add . 
-v + git diff --cached --exit-code || git commit -m "Update doc from commit ${{ github.sha }}" + git push origin gh-pages diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml index 6c2175117e5..10bc92327de 100644 --- a/.github/workflows/_test.yml +++ b/.github/workflows/_test.yml @@ -1,180 +1,180 @@ name: xla-test on: - workflow_call: - inputs: - dev-image: - required: true - type: string - description: Base image for builds - runner: - required: false - type: string - description: Runner type for the test - default: linux.12xlarge - collect-coverage: - required: false - type: boolean - description: Set to true to collect coverage information - default: false - timeout-minutes: - required: false - type: number - default: 180 # Takes ~105m as of 2025/5/30. - description: | - Set the maximum (in minutes) how long the workflow should take to finish + workflow_call: + inputs: + dev-image: + required: true + type: string + description: Base image for builds + runner: + required: false + type: string + description: Runner type for the test + default: linux.12xlarge + collect-coverage: + required: false + type: boolean + description: Set to true to collect coverage information + default: false timeout-minutes: - torch-commit: - required: true - type: string - description: torch-commit - has_code_changes: - required: false - type: string - description: Whether to run full workflow or not - default: 'true' - secrets: - gcloud-service-key: - required: true - description: Secret to access Bazel build cache + required: false + type: number + default: 180 # Takes ~105m as of 2025/5/30. + description: | + Set the maximum (in minutes) how long the workflow should take to finish + timeout-minutes: + torch-commit: + required: true + type: string + description: torch-commit + has_code_changes: + required: false + type: string + description: Whether to run full workflow or not + default: "true" + secrets: + gcloud-service-key: + required: true + description: Secret to access Bazel build cache jobs: - test: - runs-on: ${{ inputs.runner }} - container: - image: ${{ inputs.dev-image }} - options: "--shm-size 16g" - strategy: - fail-fast: false - matrix: - include: - # Use readable strings as they define the workflow titles. - - run_benchmark_tests: 'benchmark_tests' - - run_python_tests: 'python_tests' - run_xla_op_tests1: 'xla_op1' - - run_python_tests: 'python_tests' - run_xla_op_tests2: 'xla_op2' - - run_python_tests: 'python_tests' - run_xla_op_tests3: 'xla_op3' - - run_python_tests: 'python_tests' - run_xla_op_tests4: 'xla_op4' - - run_python_tests: 'python_tests' - run_xla_op_tests5: 'xla_op5' - - run_python_tests: 'python_tests' - run_torch_mp_op_tests: 'torch_mp_op' - - run_cpp_tests: 'cpp_tests' - timeout-minutes: ${{ inputs.timeout-minutes }} - env: - GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }} - GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json - USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }} - RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }} - RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }} - RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }} - RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }} - RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }} - RUN_XLA_OP_TESTS4: ${{ matrix.run_xla_op_tests4 }} - RUN_XLA_OP_TESTS5: ${{ matrix.run_xla_op_tests5 }} - RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }} - RUN_CPP_TESTS: ${{ matrix.run_cpp_tests }} - BAZEL_JOBS: '' # Let bazel decide the parallelism based on the number of CPUs. 
- BAZEL_REMOTE_CACHE: 1 - steps: - - name: Checkout actions - if: inputs.has_code_changes == 'true' - uses: actions/checkout@v4 - with: - sparse-checkout: | - .github/workflows/setup - path: .actions - - name: Setup - if: inputs.has_code_changes == 'true' - uses: ./.actions/.github/workflows/setup - with: - torch-commit: ${{ inputs.torch-commit }} - wheels-artifact: torch-xla-wheels - - name: Fetch CPP test binaries - if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests - uses: actions/download-artifact@v4 - with: - name: cpp-test-bin - path: /tmp/test/bin - # GitHub Actions doesn't preserve executable permissions - # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss - - name: Set CPP test permissions - if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests - run: | - chmod +x /tmp/test/bin/* - ls -l /tmp/test/bin - - name: Install test deps - if: inputs.has_code_changes == 'true' - shell: bash - run: | - # TODO: Add these in setup.py - pip install fsspec - pip install rich - pip install flax - - name: Checkout PyTorch Repo - if: inputs.has_code_changes == 'true' - uses: actions/checkout@v4 - with: - repository: pytorch/pytorch - path: pytorch - ref: ${{ inputs.torch-commit }} - - name: Checkout PyTorch/XLA Repo - if: inputs.has_code_changes == 'true' - uses: actions/checkout@v4 - with: - path: pytorch/xla - - name: Extra CI deps - if: inputs.has_code_changes == 'true' - shell: bash - run: | - set -x + test: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.dev-image }} + options: "--shm-size 16g" + strategy: + fail-fast: false + matrix: + include: + # Use readable strings as they define the workflow titles. + - run_benchmark_tests: "benchmark_tests" + - run_python_tests: "python_tests" + run_xla_op_tests1: "xla_op1" + - run_python_tests: "python_tests" + run_xla_op_tests2: "xla_op2" + - run_python_tests: "python_tests" + run_xla_op_tests3: "xla_op3" + - run_python_tests: "python_tests" + run_xla_op_tests4: "xla_op4" + - run_python_tests: "python_tests" + run_xla_op_tests5: "xla_op5" + - run_python_tests: "python_tests" + run_torch_mp_op_tests: "torch_mp_op" + - run_cpp_tests: "cpp_tests" + timeout-minutes: ${{ inputs.timeout-minutes }} + env: + GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }} + GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json + USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }} + RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }} + RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }} + RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }} + RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }} + RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }} + RUN_XLA_OP_TESTS4: ${{ matrix.run_xla_op_tests4 }} + RUN_XLA_OP_TESTS5: ${{ matrix.run_xla_op_tests5 }} + RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }} + RUN_CPP_TESTS: ${{ matrix.run_cpp_tests }} + BAZEL_JOBS: "" # Let bazel decide the parallelism based on the number of CPUs. 
+ BAZEL_REMOTE_CACHE: 1 + steps: + - name: Checkout actions + if: inputs.has_code_changes == 'true' + uses: actions/checkout@v4 + with: + sparse-checkout: | + .github/workflows/setup + path: .actions + - name: Setup + if: inputs.has_code_changes == 'true' + uses: ./.actions/.github/workflows/setup + with: + torch-commit: ${{ inputs.torch-commit }} + wheels-artifact: torch-xla-wheels + - name: Fetch CPP test binaries + if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests + uses: actions/download-artifact@v5 + with: + name: cpp-test-bin + path: /tmp/test/bin + # GitHub Actions doesn't preserve executable permissions + # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss + - name: Set CPP test permissions + if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests + run: | + chmod +x /tmp/test/bin/* + ls -l /tmp/test/bin + - name: Install test deps + if: inputs.has_code_changes == 'true' + shell: bash + run: | + # TODO: Add these in setup.py + pip install fsspec + pip install rich + pip install flax + - name: Checkout PyTorch Repo + if: inputs.has_code_changes == 'true' + uses: actions/checkout@v4 + with: + repository: pytorch/pytorch + path: pytorch + ref: ${{ inputs.torch-commit }} + - name: Checkout PyTorch/XLA Repo + if: inputs.has_code_changes == 'true' + uses: actions/checkout@v4 + with: + path: pytorch/xla + - name: Extra CI deps + if: inputs.has_code_changes == 'true' + shell: bash + run: | + set -x - pip install expecttest unittest-xml-reporting - pip install --pre 'torch_xla[pallas]' --index-url https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ --find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html + pip install expecttest unittest-xml-reporting + pip install --pre 'torch_xla[pallas]' --index-url https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ --find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html - if [[ ! -z "$RUN_BENCHMARK_TESTS" ]]; then - pip install -r pytorch/xla/benchmarks/requirements.txt - fi - - name: Test - if: inputs.has_code_changes == 'true' - shell: bash - run: pytorch/xla/.github/scripts/run_tests.sh pytorch/ pytorch/xla/ $USE_COVERAGE - - name: Upload coverage results - if: inputs.has_code_changes == 'true' && inputs.collect-coverage - shell: bash - env: - CIRCLE_WORKFLOW_ID: ${{ github.run_id }} - CIRCLE_BUILD_NUM: ${{ github.run_number }} - BENCHMARK_TEST_NAME: ${{ env.RUN_BENCHMARK_TESTS }} - PYTHON_TEST_NAME: ${{ env.RUN_PYTHON_TESTS }}${{ env.RUN_XLA_OP_TESTS1 }}${{ env.RUN_XLA_OP_TESTS2 }}${{ env.RUN_XLA_OP_TESTS3 }}${{ env.RUN_XLA_OP_TESTS4 }}${{ env.RUN_XLA_OP_TESTS5 }}${{ env.RUN_TORCH_MP_OP_TESTS }} - CPP_TEST_NAME: ${{ env.RUN_CPP_TESTS }} - run: | - # TODO(yeounoh) collect coverage report as needed. - if [ -n "${BENCHMARK_TEST_NAME}" ]; then - exit 0 - fi - docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}" - if [ -n "${PYTHON_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out - fi + if [[ ! 
-z "$RUN_BENCHMARK_TESTS" ]]; then + pip install -r pytorch/xla/benchmarks/requirements.txt + fi + - name: Test + if: inputs.has_code_changes == 'true' + shell: bash + run: pytorch/xla/.github/scripts/run_tests.sh pytorch/ pytorch/xla/ $USE_COVERAGE + - name: Upload coverage results + if: inputs.has_code_changes == 'true' && inputs.collect-coverage + shell: bash + env: + CIRCLE_WORKFLOW_ID: ${{ github.run_id }} + CIRCLE_BUILD_NUM: ${{ github.run_number }} + BENCHMARK_TEST_NAME: ${{ env.RUN_BENCHMARK_TESTS }} + PYTHON_TEST_NAME: ${{ env.RUN_PYTHON_TESTS }}${{ env.RUN_XLA_OP_TESTS1 }}${{ env.RUN_XLA_OP_TESTS2 }}${{ env.RUN_XLA_OP_TESTS3 }}${{ env.RUN_XLA_OP_TESTS4 }}${{ env.RUN_XLA_OP_TESTS5 }}${{ env.RUN_TORCH_MP_OP_TESTS }} + CPP_TEST_NAME: ${{ env.RUN_CPP_TESTS }} + run: | + # TODO(yeounoh) collect coverage report as needed. + if [ -n "${BENCHMARK_TEST_NAME}" ]; then + exit 0 + fi + docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}" + if [ -n "${PYTHON_TEST_NAME}" ]; then + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out + fi - if [ -n "${CPP_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out - fi + if [ -n "${CPP_TEST_NAME}" ]; then + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out + fi - if [ "${CPP_TEST_NAME}" == "cpp_tests" ]; then - ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' - echo $ABS_METADATA > abs_metadata.json - gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json + if [ "${CPP_TEST_NAME}" == "cpp_tests" ]; then + ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' + echo $ABS_METADATA > abs_metadata.json + gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json - INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' - echo $INC_METADATA > inc_metadata.json - gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json - fi - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require 
running the full test suite." + INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' + echo $INC_METADATA > inc_metadata.json + gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json + fi + - name: Report no code changes + if: inputs.has_code_changes == 'false' + run: | + echo "No code changes were detected that require running the full test suite." diff --git a/.github/workflows/_torchprime_ci.yml b/.github/workflows/_torchprime_ci.yml index 55ab65193df..93172a7492a 100644 --- a/.github/workflows/_torchprime_ci.yml +++ b/.github/workflows/_torchprime_ci.yml @@ -1,115 +1,115 @@ name: torchprime E2E tests description: | - This workflow builds a docker image with the PyTorch/XLA wheels and then - triggers a torchprime (https://github.com/AI-Hypercomputer/torchprime) - E2E test using that docker image. It is intended to catch performance - regressions and API breaking changes in PyTorch/XLA pull requests. + This workflow builds a docker image with the PyTorch/XLA wheels and then + triggers a torchprime (https://github.com/AI-Hypercomputer/torchprime) + E2E test using that docker image. It is intended to catch performance + regressions and API breaking changes in PyTorch/XLA pull requests. on: - workflow_call: - inputs: - timeout-minutes: - required: false - type: number - description: Timeout in minutes for the job run - default: 80 - has_code_changes: - required: false - type: string - description: Whether to run full workflow or not - default: 'true' - secrets: - # This is a token for the `torchxlabot2` user, which has access to the torchprime repo. - # It is used to trigger the torchprime E2E test workflow. - # The token should be managed in the "Settings > Secrets and variables > Actions" - # section of the repo. - TORCH_XLA_BOT_TOKEN: - required: true - GCLOUD_SERVICE_KEY: - required: true + workflow_call: + inputs: + timeout-minutes: + required: false + type: number + description: Timeout in minutes for the job run + default: 80 + has_code_changes: + required: false + type: string + description: Whether to run full workflow or not + default: "true" + secrets: + # This is a token for the `torchxlabot2` user, which has access to the torchprime repo. + # It is used to trigger the torchprime E2E test workflow. + # The token should be managed in the "Settings > Secrets and variables > Actions" + # section of the repo. + TORCH_XLA_BOT_TOKEN: + required: true + GCLOUD_SERVICE_KEY: + required: true jobs: - torchprime-e2e-test: - name: Run torchprime E2E tests - timeout-minutes: ${{ inputs.timeout-minutes }} - runs-on: ubuntu-22.04 - steps: - - name: Use Docker in rootless mode - if: inputs.has_code_changes == 'true' - uses: ScribeMD/rootless-docker@0.2.2 - - name: Add user to docker group - if: inputs.has_code_changes == 'true' - run: | - sudo usermod -aG docker $USER - newgrp docker - shell: bash - # Googlers: if this fails, follow go/ptxla-sa-key to debug. 
- - uses: google-github-actions/auth@v2 - if: inputs.has_code_changes == 'true' - with: - credentials_json: '${{ secrets.GCLOUD_SERVICE_KEY }}' - - uses: google-github-actions/setup-gcloud@v2 - if: inputs.has_code_changes == 'true' - with: - version: '>= 363.0.0' - install_components: 'beta,gke-gcloud-auth-plugin' - - name: Verify GCP setup - if: inputs.has_code_changes == 'true' - run: gcloud info - shell: bash - - name: Authenticate Docker - if: inputs.has_code_changes == 'true' - run: gcloud auth configure-docker --quiet - shell: bash - - name: Activate SA credentials - if: inputs.has_code_changes == 'true' - run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS - shell: bash - - name: Checkout infra - if: inputs.has_code_changes == 'true' - uses: actions/checkout@v4 - with: - sparse-checkout: | - infra - fetch-depth: 1 - path: pytorch-xla - # Build a docker image for torchprime E2E test - # First download the torch-xla-wheels - - name: Fetch wheels - if: inputs.has_code_changes == 'true' - uses: actions/download-artifact@v4 - with: - name: torch-xla-wheels - path: /tmp/wheels/ - # Generate a 16-character random ID for the docker tag - - name: Generate random docker tag - if: inputs.has_code_changes == 'true' - id: random_tag - shell: bash - run: | - echo "random_id=$(openssl rand -hex 8)" >> $GITHUB_OUTPUT - # Then run docker to install them and push a docker - - name: Build and push docker image - if: inputs.has_code_changes == 'true' - id: build_docker - shell: bash - working-directory: pytorch-xla - run: | - . ./infra/ansible/publish_torchprime_e2e_test_docker.sh - echo "docker_url=gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" >> $GITHUB_OUTPUT - env: - DEFAULT_CONTEXT_PATH: /tmp/wheels - DOCKER_IMAGE_NAME: for-torchprime-ci - DOCKER_IMAGE_TAG: ${{ steps.random_tag.outputs.random_id }} - DOCKER_PROJECT: tpu-pytorch - # Trigger torchprime E2E test workflow. - # (Googlers only) in case of infra failure, refer to go/ptxla-torchprime-trigger - # Refer to the same doc on the retention policy of the docker images. - - uses: convictional/trigger-workflow-and-wait@v1.6.5 - if: inputs.has_code_changes == 'true' - with: - owner: AI-Hypercomputer - repo: torchprime - github_token: ${{ secrets.TORCH_XLA_BOT_TOKEN }} - workflow_file_name: e2e_test.yml - wait_interval: 60 - ref: main - client_payload: '{"docker_url": "${{ steps.build_docker.outputs.docker_url }}"}' + torchprime-e2e-test: + name: Run torchprime E2E tests + timeout-minutes: ${{ inputs.timeout-minutes }} + runs-on: ubuntu-22.04 + steps: + - name: Use Docker in rootless mode + if: inputs.has_code_changes == 'true' + uses: ScribeMD/rootless-docker@0.2.2 + - name: Add user to docker group + if: inputs.has_code_changes == 'true' + run: | + sudo usermod -aG docker $USER + newgrp docker + shell: bash + # Googlers: if this fails, follow go/ptxla-sa-key to debug. 
+ - uses: google-github-actions/auth@v2 + if: inputs.has_code_changes == 'true' + with: + credentials_json: "${{ secrets.GCLOUD_SERVICE_KEY }}" + - uses: google-github-actions/setup-gcloud@v2 + if: inputs.has_code_changes == 'true' + with: + version: ">= 363.0.0" + install_components: "beta,gke-gcloud-auth-plugin" + - name: Verify GCP setup + if: inputs.has_code_changes == 'true' + run: gcloud info + shell: bash + - name: Authenticate Docker + if: inputs.has_code_changes == 'true' + run: gcloud auth configure-docker --quiet + shell: bash + - name: Activate SA credentials + if: inputs.has_code_changes == 'true' + run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS + shell: bash + - name: Checkout infra + if: inputs.has_code_changes == 'true' + uses: actions/checkout@v4 + with: + sparse-checkout: | + infra + fetch-depth: 1 + path: pytorch-xla + # Build a docker image for torchprime E2E test + # First download the torch-xla-wheels + - name: Fetch wheels + if: inputs.has_code_changes == 'true' + uses: actions/download-artifact@v5 + with: + name: torch-xla-wheels + path: /tmp/wheels/ + # Generate a 16-character random ID for the docker tag + - name: Generate random docker tag + if: inputs.has_code_changes == 'true' + id: random_tag + shell: bash + run: | + echo "random_id=$(openssl rand -hex 8)" >> $GITHUB_OUTPUT + # Then run docker to install them and push a docker + - name: Build and push docker image + if: inputs.has_code_changes == 'true' + id: build_docker + shell: bash + working-directory: pytorch-xla + run: | + . ./infra/ansible/publish_torchprime_e2e_test_docker.sh + echo "docker_url=gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" >> $GITHUB_OUTPUT + env: + DEFAULT_CONTEXT_PATH: /tmp/wheels + DOCKER_IMAGE_NAME: for-torchprime-ci + DOCKER_IMAGE_TAG: ${{ steps.random_tag.outputs.random_id }} + DOCKER_PROJECT: tpu-pytorch + # Trigger torchprime E2E test workflow. + # (Googlers only) in case of infra failure, refer to go/ptxla-torchprime-trigger + # Refer to the same doc on the retention policy of the docker images. 
+ - uses: convictional/trigger-workflow-and-wait@v1.6.5 + if: inputs.has_code_changes == 'true' + with: + owner: AI-Hypercomputer + repo: torchprime + github_token: ${{ secrets.TORCH_XLA_BOT_TOKEN }} + workflow_file_name: e2e_test.yml + wait_interval: 60 + ref: main + client_payload: '{"docker_url": "${{ steps.build_docker.outputs.docker_url }}"}' diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml index e1d6fdb8599..c5c9a9ad1e6 100644 --- a/.github/workflows/setup/action.yml +++ b/.github/workflows/setup/action.yml @@ -1,53 +1,53 @@ name: Set up PyTorch/XLA inputs: - torch-commit: - type: string - description: PyTorch commit to check out, if provided - wheels-artifact: - type: string - description: | - Artifact containing `torch` (cpu) and `torch-xla` wheels to install + torch-commit: + type: string + description: PyTorch commit to check out, if provided + wheels-artifact: + type: string + description: | + Artifact containing `torch` (cpu) and `torch-xla` wheels to install runs: - using: "composite" - steps: - # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802 - - name: Clean up workspace - shell: bash - run: | - ls -la - rm -rvf ${GITHUB_WORKSPACE}/* - - name: Setup gcloud - shell: bash - run: | - echo "${GCLOUD_SERVICE_KEY}" > /tmp/default_credentials.json - echo "GOOGLE_APPLICATION_CREDENTIALS=/tmp/default_credentials.json" >> $GITHUB_ENV - # GCLOUD_SERVICE_KEY needs to be set from the outside because for some - # reason composite actions don't support secrets. - # https://docs.github.com/en/actions/using-workflows/avoiding-duplication - if: ${{ env.GCLOUD_SERVICE_KEY }} - - name: Checkout PyTorch Repo - uses: actions/checkout@v4 - with: - repository: pytorch/pytorch - path: pytorch - ref: ${{ inputs.torch-commit }} - submodules: recursive - if: ${{ inputs.torch-commit }} - - name: Checkout PyTorch/XLA Repo - uses: actions/checkout@v4 - with: - path: pytorch/xla - - name: Fetch PyTorch/XLA packages - uses: actions/download-artifact@v4 - with: - name: ${{ inputs.wheels-artifact }} - path: /tmp/wheels/ - if: ${{ inputs.wheels-artifact }} - - name: Install wheels - shell: bash - run: | - pip install /tmp/wheels/*.whl + using: "composite" + steps: + # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802 + - name: Clean up workspace + shell: bash + run: | + ls -la + rm -rvf ${GITHUB_WORKSPACE}/* + - name: Setup gcloud + shell: bash + run: | + echo "${GCLOUD_SERVICE_KEY}" > /tmp/default_credentials.json + echo "GOOGLE_APPLICATION_CREDENTIALS=/tmp/default_credentials.json" >> $GITHUB_ENV + # GCLOUD_SERVICE_KEY needs to be set from the outside because for some + # reason composite actions don't support secrets. + # https://docs.github.com/en/actions/using-workflows/avoiding-duplication + if: ${{ env.GCLOUD_SERVICE_KEY }} + - name: Checkout PyTorch Repo + uses: actions/checkout@v4 + with: + repository: pytorch/pytorch + path: pytorch + ref: ${{ inputs.torch-commit }} + submodules: recursive + if: ${{ inputs.torch-commit }} + - name: Checkout PyTorch/XLA Repo + uses: actions/checkout@v4 + with: + path: pytorch/xla + - name: Fetch PyTorch/XLA packages + uses: actions/download-artifact@v5 + with: + name: ${{ inputs.wheels-artifact }} + path: /tmp/wheels/ + if: ${{ inputs.wheels-artifact }} + - name: Install wheels + shell: bash + run: | + pip install /tmp/wheels/*.whl - echo "Import check..." - python -c "import torch_xla" - if: ${{ inputs.wheels-artifact }} + echo "Import check..." 
+ python -c "import torch_xla" + if: ${{ inputs.wheels-artifact }} From 79192c0814c8f5f66bb475e60471f22a133abbbe Mon Sep 17 00:00:00 2001 From: Han Qi Date: Mon, 22 Sep 2025 22:17:54 +0000 Subject: [PATCH 3/4] Add fmt --- WORKSPACE | 15 ++++++++++++++- bazel/torch.BUILD | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/WORKSPACE b/WORKSPACE index 70b7d9cc098..9567f879660 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,4 +1,5 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") ################################ Python Setup ################################ @@ -9,6 +10,18 @@ http_archive( urls = ["https://github.com/pybind/pybind11_bazel/archive/fc56ce8a8b51e3dd941139d329b63ccfea1d304b.zip"], ) +git_repository( + name = "fmt", + branch = "master", + patch_cmds = [ + "mv support/bazel/.bazelrc .bazelrc", + "mv support/bazel/.bazelversion .bazelversion", + "mv support/bazel/BUILD.bazel BUILD.bazel", + "mv support/bazel/WORKSPACE.bazel WORKSPACE.bazel", + ], + remote = "https://github.com/fmtlib/fmt", +) + http_archive( name = "pybind11", build_file = "@pybind11_bazel//:pybind11.BUILD", @@ -82,7 +95,7 @@ http_archive( # Initialize OpenXLA's external dependencies. There is an specific order # which those dependencies are initialized, because for bazel it's the # first definition that takes precedence. -# We follow what openxla/xla does exactly: +# We follow what openxla/xla does exactly: # https://github.com/openxla/xla/blob/main/WORKSPACE#L37 load("@xla//:workspace4.bzl", "xla_workspace4") diff --git a/bazel/torch.BUILD b/bazel/torch.BUILD index afc6bb57af9..cfbd620e077 100644 --- a/bazel/torch.BUILD +++ b/bazel/torch.BUILD @@ -10,6 +10,9 @@ cc_library( ["torch/include/**/*.h"], ["torch/include/google/protobuf/**/*.h"], ), + deps = [ + "@fmt", + ], strip_include_prefix = "torch/include", ) From 61dbc117c56fb255ea207886cf985e46eacc5a58 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Mon, 22 Sep 2025 23:37:30 +0000 Subject: [PATCH 4/4] Add fmt build file --- WORKSPACE | 19 ++++++------------- bazel/fmt.BUILD | 9 +++++++++ 2 files changed, 15 insertions(+), 13 deletions(-) create mode 100644 bazel/fmt.BUILD diff --git a/WORKSPACE b/WORKSPACE index 9567f879660..78e928d2a0f 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,5 +1,4 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") ################################ Python Setup ################################ @@ -10,18 +9,6 @@ http_archive( urls = ["https://github.com/pybind/pybind11_bazel/archive/fc56ce8a8b51e3dd941139d329b63ccfea1d304b.zip"], ) -git_repository( - name = "fmt", - branch = "master", - patch_cmds = [ - "mv support/bazel/.bazelrc .bazelrc", - "mv support/bazel/.bazelversion .bazelversion", - "mv support/bazel/BUILD.bazel BUILD.bazel", - "mv support/bazel/WORKSPACE.bazel WORKSPACE.bazel", - ], - remote = "https://github.com/fmtlib/fmt", -) - http_archive( name = "pybind11", build_file = "@pybind11_bazel//:pybind11.BUILD", @@ -55,6 +42,12 @@ new_local_repository( path = PYTORCH_LOCAL_DIR, ) +new_local_repository( + name = "fmt", + build_file = "//bazel:fmt.BUILD", + path = PYTORCH_LOCAL_DIR + "/third_party/fmt", +) + ############################# OpenXLA Setup ############################### # To build PyTorch/XLA with a new revison of OpenXLA, update the xla_hash to diff --git a/bazel/fmt.BUILD b/bazel/fmt.BUILD new file mode 100644 index 
00000000000..ea8c566b98a --- /dev/null +++ b/bazel/fmt.BUILD @@ -0,0 +1,9 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") + +cc_library( + name = "fmt", + hdrs = glob(["include/fmt/*.h",]), + defines = ["FMT_HEADER_ONLY=1"], + includes = ["include"], + visibility = ["//visibility:public"], +)
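
Note on patches 3 and 4 (illustrative material, not part of the series): the net effect of the two fmt patches is that "@fmt" is no longer fetched from fmt's master branch with git_repository; instead bazel/fmt.BUILD wraps the copy already vendored at third_party/fmt inside the local PyTorch checkout (PYTORCH_LOCAL_DIR), which keeps the fmt version in lockstep with the torch headers that now list "@fmt" in their deps (bazel/torch.BUILD). The C++ translation unit below is a minimal sketch, not code from this series, of what the header-only setup gives any Bazel target that depends on "@fmt" the same way bazel/torch.BUILD does. It assumes only the public fmt API (<fmt/format.h>, fmt::format); the file name and the sample strings are hypothetical.

// fmt_smoke.cpp (hypothetical file): compiles against the "@fmt" target defined
// in bazel/fmt.BUILD. Because that target puts FMT_HEADER_ONLY=1 in `defines`,
// the formatting implementation is compiled into this translation unit and no
// separate fmt library has to be built or linked.
#include <fmt/format.h>

#include <cstdio>
#include <string>

int main() {
  // Typical fmt usage: {} replacement fields with Python-style format specs.
  std::string msg = fmt::format("built {} wheels in {:.1f} min", 2, 20.5);
  std::puts(msg.c_str());
  return 0;
}

Design-wise, reusing PyTorch's vendored third_party/fmt (rather than pinning a separate download, let alone a moving master branch as in patch 3) avoids a second copy of the library and any version skew with the torch headers being built against.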