From 8fbc305bce39f62184a6fc8dc9bf3888ae274428 Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Mon, 28 Apr 2025 03:22:59 +0000 Subject: [PATCH 1/9] NPU add titan test --- .github/workflows/_ascend_npu_torchtitan.yml | 2 +- .github/workflows/ascend_npu_test.yml | 25 ++++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml index 07f554a..0b1a9cf 100644 --- a/.github/workflows/_ascend_npu_torchtitan.yml +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -127,7 +127,7 @@ jobs: - name: Run torchtitan integration_test working-directory: torchtitan - run: | + run: | npu_count=$(python -c "import torch; print(torch.npu.device_count())") python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu ${npu_count} diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index e6eda7c..ee06db5 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -120,6 +120,25 @@ jobs: image: ${{ needs.prepare.outputs.image }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + benchmark: + name: Run benchmarks + needs: + - prepare + - build-torch + - build + if: | + !cancelled() && github.event_name != 'repository_dispatch' && + (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) + uses: ./.github/workflows/_ascend_npu_benchmark.yml + with: + runner: ${{ needs.prepare.outputs.runner }} + image: ${{ needs.prepare.outputs.image }} + device: ${{ needs.prepare.outputs.device }} + torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} + secrets: + pr-token: ${{ secrets.COSDT_BOT_TOKEN }} + torchtitan: name: Run torchtitan needs: @@ -131,8 +150,10 @@ jobs: (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) uses: 
./.github/workflows/_ascend_npu_torchtitan.yml with: - runner: ${{ needs.prepare.outputs.runner }} + runner: "linux-arm64-npu-4" image: ${{ needs.prepare.outputs.image }} + device: ${{ needs.prepare.outputs.device }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} - + secrets: + pr-token: ${{ secrets.COSDT_BOT_TOKEN }} From c253f09e5d491c1866b91267ab4192d8185ff1a1 Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Mon, 28 Apr 2025 03:48:48 +0000 Subject: [PATCH 2/9] NPU add torchtitan test --- .github/workflows/ascend_npu_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index ee06db5..5783073 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -56,7 +56,7 @@ on: - ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8 - ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10 - ascendai/cann:latest - default: "ascendai/cann:latest" + default: "ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10" description: "The docker image which will be loaded" # Only cancel the previous runs when triggered by a pull_request event From 29f77f446402300d3e05ebf5680b3f36f934944d Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Mon, 28 Apr 2025 04:46:47 +0000 Subject: [PATCH 3/9] NPU add titan test --- .github/workflows/ascend_npu_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index 5783073..5137bad 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -56,7 +56,7 @@ on: - ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8 - ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10 - ascendai/cann:latest - default: "ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10" + default: "ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8" description: "The docker image which will be loaded" # Only 
cancel the previous runs when triggered by a pull_request event From 964ca7613ccc84195ccf898fae5608e8adad6bb2 Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Thu, 29 May 2025 01:29:33 +0000 Subject: [PATCH 4/9] add torchtitan --- .github/workflows/ascend_npu_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index 5137bad..ee06db5 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -56,7 +56,7 @@ on: - ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8 - ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10 - ascendai/cann:latest - default: "ascendai/cann:7.0.1-910b-ubuntu22.04-py3.8" + default: "ascendai/cann:latest" description: "The docker image which will be loaded" # Only cancel the previous runs when triggered by a pull_request event From 7a515a4a2dada6fa82a845136421b68539fd7eb9 Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Tue, 3 Jun 2025 09:19:32 +0000 Subject: [PATCH 5/9] titan test --- .github/workflows/ascend_npu_test.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index ee06db5..95b50b2 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -120,6 +120,25 @@ jobs: image: ${{ needs.prepare.outputs.image }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} +<<<<<<< HEAD +======= + test: + name: Test torch_npu + needs: + - prepare + - build-torch + - build + if: | + !cancelled() && github.event_name != 'repository_dispatch' && + (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) + uses: ./.github/workflows/_ascend_npu_ut.yml + with: + runner: ${{ needs.prepare.outputs.runner }} + image: ${{ needs.prepare.outputs.image }} + device: ${{ needs.prepare.outputs.device }} + torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + 
torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} + benchmark: name: Run benchmarks needs: From 1c6cf7fdcef45c3e291c4b791866370280e51492 Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Fri, 6 Jun 2025 07:33:58 +0000 Subject: [PATCH 6/9] after rebasing --- .github/workflows/_ascend_npu_torchtitan.yml | 12 ------------ .github/workflows/ascend_npu_test.yml | 5 ----- 2 files changed, 17 deletions(-) diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml index 0b1a9cf..515ea58 100644 --- a/.github/workflows/_ascend_npu_torchtitan.yml +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -11,10 +11,6 @@ on: required: true type: string description: "The docker image which will be loaded" - device: - required: true - type: string - description: "The device selected to run on" torch-artifact: required: false type: string @@ -28,14 +24,6 @@ on: description: "A token used to create a pull request" required: true -# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly -# declared as "shell: bash -el {0}" on steps that need to be properly activated. -# It's used to activate ascend-toolkit environment variables. 
- -defaults: - run: - shell: bash -el {0} - jobs: setup_environment: name: run torchtitan tests diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index 95b50b2..f23f96d 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -120,8 +120,6 @@ jobs: image: ${{ needs.prepare.outputs.image }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} -<<<<<<< HEAD -======= test: name: Test torch_npu needs: @@ -135,7 +133,6 @@ jobs: with: runner: ${{ needs.prepare.outputs.runner }} image: ${{ needs.prepare.outputs.image }} - device: ${{ needs.prepare.outputs.device }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} @@ -152,7 +149,6 @@ jobs: with: runner: ${{ needs.prepare.outputs.runner }} image: ${{ needs.prepare.outputs.image }} - device: ${{ needs.prepare.outputs.device }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} secrets: @@ -171,7 +167,6 @@ jobs: with: runner: "linux-arm64-npu-4" image: ${{ needs.prepare.outputs.image }} - device: ${{ needs.prepare.outputs.device }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} secrets: From ae72ad32b952a011f64d36757ee1bc10abb5647b Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Sat, 7 Jun 2025 11:02:32 +0000 Subject: [PATCH 7/9] use 2 npus --- .github/workflows/ascend_npu_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index f23f96d..8454625 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -165,7 +165,7 @@ jobs: (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) uses: 
./.github/workflows/_ascend_npu_torchtitan.yml with: - runner: "linux-arm64-npu-4" + runner: "linux-arm64-npu-2" image: ${{ needs.prepare.outputs.image }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} From 43b0bfd4e89b23d3e5a428c16e7468d0c095dad0 Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Sun, 8 Jun 2025 06:28:20 +0000 Subject: [PATCH 8/9] add installments --- .github/workflows/_ascend_npu_torchtitan.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml index 515ea58..914d7ca 100644 --- a/.github/workflows/_ascend_npu_torchtitan.yml +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -104,6 +104,10 @@ jobs: run: | pip install -r requirements.txt pip install pytest pytest-cov tyro + pip install "torchdata>=0.8.0" + pip install "datasets>=3.6.0" + pip install "tomli>=1.1.0" + pip install tensorboard tiktoken blobfile tabulate wandb fsspec tyro - name: Show environment info run: | From 99b1bb629459e458b637ef410133c60d711dfef5 Mon Sep 17 00:00:00 2001 From: lowdy1 Date: Tue, 10 Jun 2025 02:32:54 +0000 Subject: [PATCH 9/9] change runner --- .github/workflows/ascend_npu_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index 8454625..d7b1c9a 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -165,7 +165,7 @@ jobs: (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) uses: ./.github/workflows/_ascend_npu_torchtitan.yml with: - runner: "linux-arm64-npu-2" + runner: ${{ needs.prepare.outputs.runner }} image: ${{ needs.prepare.outputs.image }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }}