diff --git a/.github/workflows/ce_daily_dev.yml b/.github/workflows/ce_daily_dev.yml index c53d7f1b6..6745d55f5 100644 --- a/.github/workflows/ce_daily_dev.yml +++ b/.github/workflows/ce_daily_dev.yml @@ -1,6 +1,9 @@ name: CE Daily Dev on: + pull_request: + types: [opened, synchronize] + branches: [develop] schedule: - cron: '0 19 * * *' workflow_dispatch: # 可选:允许手动点 Run @@ -44,1083 +47,28 @@ defaults: shell: bash jobs: - single_card_test: - name: Unit test (single card) - runs-on: - group: Fleet-H-single-card - strategy: - fail-fast: false - max-parallel: 2 - matrix: - include: - - cuda: "12.6" - python: "3.10" - - cuda: "12.9" - python: "3.10" - - cuda: "13.0" - python: "3.10" - - cuda: "12.9" - python: "3.11" - - cuda: "12.9" - python: "3.12" - - cuda: "12.9" - python: "3.13" - env: - PIP_CACHE_DIR: /home/.cache/pip - CACHE_DIR: /home/.cache - TASK: paddlefleet-CE-${{ matrix.cuda }}-${{ matrix.python }}-DEV-single-card-test - steps: - - name: Determine the runner - run: | - gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 )) - cuda=${{ matrix.cuda }} - python_version=${{ matrix.python }} - echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV - if [ "${cuda}" == "12.9" ]; then - docker_image=${docker_image_cu129} - cuda_version="cu129" - paddle_url=${paddle_url_cu129} - elif [ "${cuda}" == "13.0" ]; then - docker_image=${docker_image_cu130} - cuda_version="cu130" - paddle_url=${paddle_url_cu130} - else - docker_image=${docker_image_cu126} - cuda_version="cu126" - paddle_url=${paddle_url_cu126} - fi - echo "DOCKER_IMAGE=${docker_image}" >> $GITHUB_ENV - echo "CUDA_VERSION=${cuda_version}" >> $GITHUB_ENV - echo "PYTHON_VERSION=${python_version}" >> $GITHUB_ENV - echo "PADDLE_URL=${paddle_url}" >> $GITHUB_ENV - - name: Check docker image and run container - env: - GPU_DEVICES: ${{ env.GPU_DEVICES }} - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE }} - CUDA_VERSION: ${{ env.CUDA_VERSION }} - PYTHON_VERSION: ${{ env.PYTHON_VERSION }} - PADDLE_URL: ${{ env.PADDLE_URL }} - - run: | - container_name=${TASK}-$(date +%Y%m%d-%H%M%S) - echo "container_name=${container_name}" >> ${{ github.env }} - docker pull ${DOCKER_IMAGE} - docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \ - -v "/dev/shm:/dev/shm" \ - -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ - -v ${{ github.workspace }}/../../..:/root \ - -v /ssd1/paddle-1/action_cache:/home/.cache \ - -v ${{ github.workspace }}:/paddle \ - -e BRANCH \ - -e PR_ID \ - -e COMMIT_ID \ - -e PADDLE_ROOT \ - -e ci_scripts \ - -e CACHE_DIR \ - -e no_proxy \ - -e use_release \ - -e repo_flag="paddlefleet" \ - -e PIP_CACHE_DIR \ - -e work_dir \ - -e CUDA_VERSION \ - -e PYTHON_VERSION \ - -e PADDLE_URL \ - -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ - -e GITHUB_REPO_NAME="${{ github.repository }}" \ - -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - -e GITHUB_RUN_ID="${{ github.run_id }}" \ - -w /paddle --network host ${DOCKER_IMAGE} - - - - name: Single card test - run: | - docker exec -t ${{ env.container_name }} /bin/bash -xce ' - pwd - find . -maxdepth 1 -name "--*" -delete - rm -rf * .[^.]* - source /root/proxy - mkdir -p /home/.cache/pip - pip cache dir - if [ ${PYTHON_VERSION} != "3.10" ]; then - wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda -u - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r - conda init bash - conda create -n python${PYTHON_VERSION} python=${PYTHON_VERSION} -y - conda activate python${PYTHON_VERSION} - paddle_whl=paddlepaddle_gpu-0.0.0-cp${PYTHON_VERSION//./}-cp${PYTHON_VERSION//./}-linux_x86_64.whl - paddle_url="${PADDLE_URL}${paddle_whl}" - else - paddle_url=${PADDLE_URL} - fi - if [ ${CUDA_VERSION} == "cu129" ]; then - paddle_whl=paddlepaddle_gpu-0.0.0-cp${PYTHON_VERSION//./}-cp${PYTHON_VERSION//./}-linux_x86_64.whl - paddle_url="${PADDLE_URL}${paddle_whl}" - fi - echo "Install uv" - export WITH_COVERAGE=ON - pip install --upgrade pip - git clone https://github.com/PaddlePaddle/PaddleFleet.git - git config --global --add safe.directory /paddle/PaddleFleet - cd PaddleFleet - git config user.name "PaddleCI" - git config user.email "paddle_ci@example.com" - git config pull.rebase false - pip install colorlog>=6.10.1 - python -m pip install --pre paddlefleet --index-url https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/${CUDA_VERSION}/ --extra-index-url https://pypi.tuna.tsinghua.edu.cn/simple - if [ "${use_release}" == "true" ]; then - echo "Using pre-built paddle package from develop branch." - pip install ${paddle_url} --index-url=https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --force-reinstall --no-cache-dir - fi - pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt matplotlib==3.10.8 pytest parameterized - echo "Paddle Commit" - python -c "import paddle; print(paddle.version.commit)" - echo "PaddleFleet Commit" - python -c "import paddlefleet; print(paddlefleet.version.commit)" - export PYTHONPATH=/paddle/PaddleFleet:$PYTHONPATH - bash ci/single_card_test.sh - single_card_exit_code=$? - if [[ "$single_card_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mSingle card test failed.\033[0m" - exit 1 - else - echo -e "\033[32mSingle card test succeeded.\033[0m" - fi - ' - - - name: Single card sonic moe test - if: matrix.python == '3.12' && matrix.cuda == '12.9' && always() - run: | - docker exec -t ${{ env.container_name }} /bin/bash -xce ' - pwd - source /root/proxy - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${{ matrix.python }} - echo "Paddle Commit" - python -c "import paddle; print(paddle.version.commit)" - echo "PaddleFleet Commit" - python -c "import paddlefleet; print(paddlefleet.version.commit)" - cd PaddleFleet - bash ci/single_card_sonic.sh - single_card_exit_code=$? - if [[ "$single_card_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mSingle card test failed.\033[0m" - exit 1 - else - echo -e "\033[32mSingle card test succeeded.\033[0m" - fi - ' - - - name: Terminate and delete the container - if: ${{ always() }} - run: | - set +e - docker exec -t ${{ env.container_name }} /bin/bash -c 'bash ci/clean_uv_cache.sh; rm -rf * .[^.]*' - docker rm -f ${{ env.container_name }} - - multi-card_test: - name: Unit test (multi-card) - runs-on: - group: Fleet-H-multi-card - strategy: - fail-fast: false - max-parallel: 1 - matrix: - include: - - cuda: "12.6" - python: "3.10" - - cuda: "12.9" - python: "3.10" - - cuda: "13.0" - python: "3.10" - - cuda: "12.9" - python: "3.11" - - cuda: "12.9" - python: "3.12" - - cuda: "12.9" - python: "3.13" - env: - PIP_CACHE_DIR: /home/.cache/pip - TASK: paddlefleet-CE-${{ matrix.cuda }}-${{ matrix.python }}-multi-card_test-dev - steps: - - name: setup cuda and python - run: | - cuda=${{ matrix.cuda }} - python_version=${{ matrix.python }} - if [ "${cuda}" == "12.9" ]; then - docker_image=${docker_image_cu129} - cuda_version="cu129" - paddle_url=${paddle_url_cu129} - elif [ "${cuda}" == "13.0" ]; then - docker_image=${docker_image_cu130} - cuda_version="cu130" - paddle_url=${paddle_url_cu130} - else - docker_image=${docker_image_cu126} - cuda_version="cu126" - paddle_url=${paddle_url_cu126} - fi - echo "DOCKER_IMAGE=${docker_image}" >> $GITHUB_ENV - echo "CUDA_VERSION=${cuda_version}" >> $GITHUB_ENV - echo "PYTHON_VERSION=${python_version}" >> $GITHUB_ENV - echo "PADDLE_URL=${paddle_url}" >> $GITHUB_ENV - - name: Check docker image and run container - env: - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE }} - CUDA_VERSION: ${{ env.CUDA_VERSION }} - PYTHON_VERSION: ${{ env.PYTHON_VERSION }} - PADDLE_URL: ${{ env.PADDLE_URL }} - run: | - container_name=${TASK}-$(date +%Y%m%d-%H%M%S) - echo "container_name=${container_name}" >> ${{ github.env }} - docker pull ${DOCKER_IMAGE} - docker run -d -t --gpus all --name ${container_name} \ - -v "/dev/shm:/dev/shm" \ - -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ - -v ${{ github.workspace }}/../../..:/root \ - -v /ssd1/paddle-1/action_cache:/home/.cache \ - -v ${{ github.workspace }}:/paddle \ - -e BRANCH \ - -e PR_ID \ - -e COMMIT_ID \ - -e PADDLE_ROOT \ - -e ci_scripts \ - -e CACHE_DIR \ - -e no_proxy \ - -e PIP_CACHE_DIR \ - -e repo_flag="paddlefleet" \ - -e use_release \ - -e work_dir \ - -e CUDA_VERSION \ - -e PYTHON_VERSION \ - -e PADDLE_URL \ - -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ - -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ - -e GITHUB_REPO_NAME="${{ github.repository }}" \ - -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ - -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - -e GITHUB_RUN_ID="${{ github.run_id }}" \ - -w /paddle --network host ${DOCKER_IMAGE} - - - name: Install PaddleFleet - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - find . -maxdepth 1 -name "--*" -delete - rm -rf * .[^.]* - source /root/proxy - pip install --upgrade pip - if [ ${PYTHON_VERSION} != "3.10" ]; then - wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda -u - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r - conda init bash - conda create -n python${PYTHON_VERSION} python=${PYTHON_VERSION} -y - conda activate python${PYTHON_VERSION} - paddle_whl=paddlepaddle_gpu-0.0.0-cp${PYTHON_VERSION//./}-cp${PYTHON_VERSION//./}-linux_x86_64.whl - paddle_url="${PADDLE_URL}${paddle_whl}" - else - paddle_url=${PADDLE_URL} - fi - if [ ${CUDA_VERSION} == "cu129" ]; then - paddle_whl=paddlepaddle_gpu-0.0.0-cp${PYTHON_VERSION//./}-cp${PYTHON_VERSION//./}-linux_x86_64.whl - paddle_url="${PADDLE_URL}${paddle_whl}" - fi - git clone https://github.com/PaddlePaddle/PaddleFleet.git - cd PaddleFleet - git config --global --add safe.directory /paddle/PaddleFleet - git config user.name "PaddleCI" - git config user.email "paddle_ci@example.com" - git config pull.rebase false - git submodule update --init --recursive - pip install colorlog>=6.10.1 - pip install nvidia-cutlass-dsl==4.2.1 - python -m pip install --pre paddlefleet --index-url https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/${CUDA_VERSION}/ --extra-index-url https://pypi.tuna.tsinghua.edu.cn/simple - if [ "${use_release}" == "true" ]; then - echo "Using pre-built paddle package from develop branch." - pip install ${paddle_url} --index-url=https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --force-reinstall --no-cache-dir - fi - pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt pytest matplotlib==3.10.8 - wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq - chmod +x /usr/local/bin/yq - ' - - - name: Multi-card test - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - if [ ${PYTHON_VERSION} != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - echo "Paddle Commit" - python -c "import paddle; print(paddle.version.commit)" - echo "PaddleFleet Commit" - python -c "import paddlefleet; print(paddlefleet.version.commit)" - export PYTHONPATH=/paddle/PaddleFleet:$PYTHONPATH - bash PaddleFleet/ci/multi-card_test.sh - multi_card_exit_code=$? - if [[ "$multi_card_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mMulti card test failed.\033[0m" - exit 1 - else - echo -e "\033[32mMulti card test succeeded.\033[0m" - fi - ' - - - name: Terminate and delete the container - if: ${{ always() }} - run: | - set +e - docker exec -t ${{ env.container_name }} /bin/bash -c 'bash ci/clean_uv_cache.sh; rm -rf * .[^.]*' - docker rm -f ${{ env.container_name }} - - - integration-test-H20-single-card: - name: Integration test (H20, single card) - runs-on: - group: Fleet-H-single-card - strategy: - fail-fast: false - max-parallel: 2 - matrix: - include: - - cuda: "12.6" - python: "3.10" - - cuda: "12.9" - python: "3.10" - - cuda: "13.0" - python: "3.10" - - cuda: "12.9" - python: "3.11" - - cuda: "12.9" - python: "3.12" - - cuda: "12.9" - python: "3.13" - env: - PIP_CACHE_DIR: /home/.cache/pip - CACHE_DIR: /home/.cache - TASK: paddlefleet-CE-${{ matrix.cuda }}-${{ matrix.python }}-DEV-integration-test-single-card-release - steps: - - name: Determine the runner - run: | - gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 )) - cuda=${{ matrix.cuda }} - python_version=${{ matrix.python }} - echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV - if [ "${cuda}" == "12.9" ]; then - docker_image=${docker_image_cu129} - cuda_version="cu129" - paddle_url=${paddle_url_cu129} - elif [ "${cuda}" == "13.0" ]; then - docker_image=${docker_image_cu130} - cuda_version="cu130" - paddle_url=${paddle_url_cu130} - else - docker_image=${docker_image_cu126} - cuda_version="cu126" - paddle_url=${paddle_url_cu126} - fi - echo "DOCKER_IMAGE=${docker_image}" >> $GITHUB_ENV - echo "CUDA_VERSION=${cuda_version}" >> $GITHUB_ENV - echo "PYTHON_VERSION=${python_version}" >> $GITHUB_ENV - echo "PADDLE_URL=${paddle_url}" >> $GITHUB_ENV - echo "BASE_NAME=${cuda_version}-${python_version}-H20" >> $GITHUB_ENV - - name: Check docker image and run container - env: - GPU_DEVICES: ${{ env.GPU_DEVICES }} - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE }} - CUDA_VERSION: ${{ env.CUDA_VERSION }} - PYTHON_VERSION: ${{ env.PYTHON_VERSION }} - PADDLE_URL: ${{ env.PADDLE_URL }} - BASE_NAME: ${{ env.BASE_NAME }} - run: | - container_name=${TASK}-$(date +%Y%m%d-%H%M%S) - echo "container_name=${container_name}" >> ${{ github.env }} - docker pull ${DOCKER_IMAGE} - set -x - docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \ - -v "/dev/shm:/dev/shm" \ - -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ - -v ${{ github.workspace }}/../../..:/root \ - -v /ssd1/paddle-1/action_cache:/home/.cache \ - -v ${{ github.workspace }}:/workspace \ - -e BRANCH \ - -e PR_ID \ - -e COMMIT_ID \ - -e PADDLE_ROOT \ - -e ci_scripts \ - -e CACHE_DIR \ - -e no_proxy \ - -e use_release \ - -e PIP_CACHE_DIR \ - -e CUDA_VERSION \ - -e PYTHON_VERSION \ - -e PADDLE_URL \ - -e BASE_NAME \ - -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ - -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ - -e GITHUB_REPO_NAME="${{ github.repository }}" \ - -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ - -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - -e GITHUB_RUN_ID="${{ github.run_id }}" \ - -w /workspace --network host ${DOCKER_IMAGE} - - - name: Install PaddleFleet - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - nvidia-smi - find . -maxdepth 1 -name "--*" -delete - rm -rf * .[^.]* - source /root/proxy - mkdir -p /home/.cache/pip - pip cache dir - pip install --upgrade pip - if [ ${PYTHON_VERSION} != "3.10" ]; then - wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda -u - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r - conda init bash - conda create -n python${PYTHON_VERSION} python=${PYTHON_VERSION} -y - conda activate python${PYTHON_VERSION} - fi - git clone https://github.com/PaddlePaddle/PaddleFleet.git - cd PaddleFleet - git config --global --add safe.directory /workspace/PaddleFleet - git config user.name "PaddleCI" - git config user.email "paddle_ci@example.com" - git config pull.rebase false - pip install colorlog>=6.10.1 - pip install nvidia-cutlass-dsl==4.2.1 - python -m pip install --pre paddlefleet --index-url https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/${CUDA_VERSION}/ --extra-index-url https://pypi.tuna.tsinghua.edu.cn/simple - wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq - chmod +x /usr/local/bin/yq - ' - - - name: Install PaddleFormers - id: formers_install - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - paddle_whl=paddlepaddle_gpu-0.0.0-cp${PYTHON_VERSION//./}-cp${PYTHON_VERSION//./}-linux_x86_64.whl - paddle_url="${PADDLE_URL}${paddle_whl}" - else - paddle_url=${PADDLE_URL} - fi - if [ ${CUDA_VERSION} == "cu129" ]; then - paddle_whl=paddlepaddle_gpu-0.0.0-cp${PYTHON_VERSION//./}-cp${PYTHON_VERSION//./}-linux_x86_64.whl - paddle_url="${PADDLE_URL}${paddle_whl}" - fi - if [ ${CUDA_VERSION} == "cu130" ]; then - pip install blinker==1.9.0 --ignore-installed - fi - git clone -b develop https://github.com/PaddlePaddle/PaddleFormers.git - cd PaddleFormers - git config --global --add safe.directory /workspace/PaddleFormers - git config user.name "PaddleCI" - git config user.email "paddle_ci@example.com" - git config pull.rebase false - git log -1 - sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py - sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py - pip install -e . --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ - if [ "${use_release}" == "true" ]; then - echo "Using pre-built paddle package from develop branch." - pip install ${paddle_url} --index-url=https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --force-reinstall --no-cache-dir - fi - pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt matplotlib==3.10.8 pytest parameterized - pip install librosa==0.11.0 - echo "Paddle Commit" - python -c "import paddle; print(paddle.version.commit)" - echo "PaddleFleet Commit" - python -c "import paddlefleet; print(paddlefleet.version.commit)" - echo "paddleformers Commit" - python -c "import paddleformers; print(paddleformers.version.commit)" - ' - - - name: Proprocess for integration test - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/preprocess.sh - preprocess_exit_code=$? - if [[ "$preprocess_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mPreprocess failed.\033[0m" - exit 1 - else - echo -e "\033[32mPreprocess succeeded.\033[0m" - fi - ' - - - name: Integration test (GLM4.5 single-card) - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh - glm45_single_card_exit_code=$? - if [[ "$glm45_single_card_exit_code" != "0" ]]; then - export case_name="glm45_single_card" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m" - fi - ' - - - name: Integration test (Qwen3-30B-A3B single-card) - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh - qwen3_single_card_exit_code=$? - if [[ "$qwen3_single_card_exit_code" != "0" ]]; then - export case_name="qwen3_single_card" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m" - fi - ' - - - name: Qwen3-vl-8k-single-card - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft_single_card.sh single - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="qwen3vl_sft_single_card" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen3-vl-8k-single-card.\033[0m" - fi - ' - - - - name: Terminate and delete the container - if: ${{ always() }} - run: | - set +e - docker exec -t ${{ env.container_name }} /bin/bash -c 'bash PaddleFleet/ci/clean_uv_cache.sh; rm -rf * .[^.]*' - docker rm -f ${{ env.container_name }} - - integration-test-H20-multi-card: - name: Integration test (H20, multi-card) - runs-on: - group: Fleet-H-multi-card - strategy: - fail-fast: false - max-parallel: 1 - matrix: - include: - - cuda: "12.6" - python: "3.10" - - cuda: "12.9" - python: "3.10" - - cuda: "13.0" - python: "3.10" - - cuda: "12.9" - python: "3.11" - - cuda: "12.9" - python: "3.12" - - cuda: "12.9" - python: "3.13" - env: - PIP_CACHE_DIR: /home/.cache/pip - CACHE_DIR: /home/.cache - TASK: paddlefleet-CE-${{ matrix.cuda }}-${{ matrix.python }}-DEV-integration-test-multi-card-release - steps: - - name: setup cuda and python - run: | - cuda=${{ matrix.cuda }} - python_version=${{ matrix.python }} - if [ "${cuda}" == "12.9" ]; then - docker_image=${docker_image_cu129} - cuda_version="cu129" - paddle_url=${paddle_url_cu129} - elif [ "${cuda}" == "13.0" ]; then - docker_image=${docker_image_cu130} - cuda_version="cu130" - paddle_url=${paddle_url_cu130} - else - docker_image=${docker_image_cu126} - cuda_version="cu126" - paddle_url=${paddle_url_cu126} - fi - echo "DOCKER_IMAGE=${docker_image}" >> $GITHUB_ENV - echo "CUDA_VERSION=${cuda_version}" >> $GITHUB_ENV - echo "PYTHON_VERSION=${python_version}" >> $GITHUB_ENV - echo "PADDLE_URL=${paddle_url}" >> $GITHUB_ENV - echo "BASE_NAME=${cuda_version}-${python_version}-H20" >> $GITHUB_ENV - - name: Check docker image and run container - env: - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE }} - CUDA_VERSION: ${{ env.CUDA_VERSION }} - PYTHON_VERSION: ${{ env.PYTHON_VERSION }} - PADDLE_URL: ${{ env.PADDLE_URL }} - BASE_NAME: ${{ env.BASE_NAME }} - run: | - container_name=${TASK}-$(date +%Y%m%d-%H%M%S) - echo "container_name=${container_name}" >> ${{ github.env }} - docker pull ${DOCKER_IMAGE} - docker run -d -t --name ${container_name} --gpus all --shm-size=32G \ - -v "/dev/shm:/dev/shm" \ - -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ - -v ${{ github.workspace }}/../../..:/root \ - -v /ssd1/paddle-1/action_cache:/home/.cache \ - -v ${{ github.workspace }}:/workspace \ - -e BRANCH \ - -e PR_ID \ - -e COMMIT_ID \ - -e PADDLE_ROOT \ - -e ci_scripts \ - -e CACHE_DIR \ - -e no_proxy \ - -e use_release \ - -e PIP_CACHE_DIR \ - -e CUDA_VERSION \ - -e PYTHON_VERSION \ - -e PADDLE_URL \ - -e BASE_NAME \ - -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ - -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ - -e GITHUB_REPO_NAME="${{ github.repository }}" \ - -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ - -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - -e GITHUB_RUN_ID="${{ github.run_id }}" \ - -w /workspace --network host ${DOCKER_IMAGE} - - - name: Install PaddleFleet - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - find . -maxdepth 1 -name "--*" -delete - rm -rf * .[^.]* - source /root/proxy - mkdir -p /home/.cache/pip - pip cache dir - pip install --upgrade pip - if [ ${PYTHON_VERSION} != "3.10" ]; then - wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda -u - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r - conda init bash - conda create -n python${PYTHON_VERSION} python=${PYTHON_VERSION} -y - conda activate python${PYTHON_VERSION} - fi - git clone https://github.com/PaddlePaddle/PaddleFleet.git - cd PaddleFleet - git config --global --add safe.directory /workspace/PaddleFleet - git config user.name "PaddleCI" - git config user.email "paddle_ci@example.com" - git config pull.rebase false - pip install colorlog>=6.10.1 - python -m pip install --pre paddlefleet --index-url https://www.paddlepaddle.org.cn/packages/nightly/$CUDA_VERSION/ --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/${CUDA_VERSION}/ --extra-index-url https://pypi.tuna.tsinghua.edu.cn/simple - wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq - chmod +x /usr/local/bin/yq - ' - - - name: Install PaddleFormers - id: formers_install - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - paddle_whl=paddlepaddle_gpu-0.0.0-cp${PYTHON_VERSION//./}-cp${PYTHON_VERSION//./}-linux_x86_64.whl - paddle_url="${PADDLE_URL}${paddle_whl}" - else - paddle_url=${PADDLE_URL} - fi - if [ ${CUDA_VERSION} == "cu129" ]; then - paddle_whl=paddlepaddle_gpu-0.0.0-cp${PYTHON_VERSION//./}-cp${PYTHON_VERSION//./}-linux_x86_64.whl - paddle_url="${PADDLE_URL}${paddle_whl}" - fi - if [ ${CUDA_VERSION} == "cu130" ]; then - pip install blinker==1.9.0 --ignore-installed - fi - git clone -b develop https://github.com/PaddlePaddle/PaddleFormers.git - cd PaddleFormers - cp examples/experiments/paddlefleet/glm45.json examples/experiments/paddlefleet/glm45_fp8.json - git config --global --add safe.directory /workspace/PaddleFormers - git config user.name "PaddleCI" - git config user.email "paddle_ci@example.com" - git config pull.rebase false - git log -1 - pip install -e . --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ - if [ "${use_release}" == "true" ]; then - echo "Using pre-built paddle package from develop branch." - pip install ${paddle_url} --index-url=https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --force-reinstall --no-cache-dir - fi - pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt - pip install bce-python-sdk==0.8.74 - pip install coverage==7.6.1 - pip install librosa==0.11.0 - echo "Paddle Commit" - python -c "import paddle; print(paddle.version.commit)" - echo "PaddleFleet Commit" - python -c "import paddlefleet; print(paddlefleet.version.commit)" - echo "paddleformers Commit" - python -c "import paddleformers; print(paddleformers.version.commit)" - ' - - - name: GLM4.5 pre-train - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_pt.sh - glm45_exit_code=$? - if [[ "$glm45_exit_code" != "0" ]]; then - export case_name="glm45_pt" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" - fi - ' - - - name: GLM4.5 sft - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_sft.sh - glm45_exit_code=$? - if [[ "$glm45_exit_code" != "0" ]]; then - export case_name="glm45_sft" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" - fi - ' - - - name: GLM4.5 lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_lora.sh - glm45_exit_code=$? - if [[ "$glm45_exit_code" != "0" ]]; then - export case_name="glm45_lora" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" - fi - ' - - - name: GLM4.5 dpo - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh - glm45_exit_code=$? - if [[ "$glm45_exit_code" != "0" ]]; then - export case_name="glm45_dpo" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m" - fi - ' - - - name: GLM4.5 dpo_lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo_lora.sh - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="glm45_dpo_lora" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo_lora.\033[0m" - fi - ' - - - - name: GLM4.5 pre-train (Grouped GEMM) - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${CUDA_VERSION}" == "cu126" ]; then - exit 0 - fi - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh - glm45_exit_code=$? - if [[ "$glm45_exit_code" != "0" ]]; then - export case_name="glm45_pt_grouped_gemm" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m" - fi - ' - - - name: Integration test (GLM4.5 multi-card FP8) - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${CUDA_VERSION}" == "cu126" ]; then - exit 0 - fi - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh - glm45_exit_code=$? - if [[ "$glm45_exit_code" != "0" ]]; then - export case_name="glm45_pt_fp8" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m" - fi - ' - - - name: GLM4.5 pre-train (EP4) - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_pt_ep4.sh - glm45_exit_code=$? - if [[ "$glm45_exit_code" != "0" ]]; then - export case_name="glm45_pt_ep4" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 EP4.\033[0m" - fi - ' - - - name: Qwen pre-train - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/qwen.sh pt - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="qwen_pt" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen pre-train.\033[0m" - fi - ' - - - name: Qwen sft - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/qwen.sh sft - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="qwen_sft" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen sft.\033[0m" - fi - ' - - - name: Qwen lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/qwen.sh lora - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="qwen_lora" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen lora.\033[0m" - fi - ' - - - name: Qwen vl sft - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh tp8 h20 - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="qwen3vl_sft_h20_tp8_multi_card" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen vl sft.\033[0m" - fi - ' - - - name: Qwen vl lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_lora.sh h20 - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="qwen3vl_lora_h20_multi_card" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen vl lora.\033[0m" - fi - ' - - - name: Qwen vl moe - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 10m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh moe h20 - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="qwen3vl_sft_h20_moe_multi_card" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen vl moe.\033[0m" - fi - ' - - - name: Qwen3-vl-8k-fsdp - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 10m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh fsdp h20 - exit_code=$? - if [[ "$exit_code" != "0" ]]; then - export case_name="qwen3vl_sft_h20_fsdp_multi_card" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen3-vl-8k-fsdp.\033[0m" - fi - ' - - - name: Terminate and delete the container - if: ${{ always() }} - run: | - set +e - docker exec -t ${{ env.container_name }} /bin/bash -c 'bash PaddleFleet/ci/clean_uv_cache.sh; rm -rf * .[^.]*' - docker rm -f ${{ env.container_name }} - - - integration-test-a100: - name: Integration test (A100) + integration-test-H20-single-card: + name: Integration test (H20, single card) runs-on: - group: Distribute + group: Fleet-H-single-card strategy: fail-fast: false max-parallel: 2 matrix: include: - - cuda: "12.6" - python: "3.10" - - cuda: "12.9" - python: "3.10" - - cuda: "13.0" - python: "3.10" - - cuda: "12.9" - python: "3.11" - cuda: "12.9" python: "3.12" - - cuda: "12.9" - python: "3.13" env: PIP_CACHE_DIR: /home/.cache/pip CACHE_DIR: /home/.cache - TASK: paddlefleet-CE-${{ matrix.cuda }}-${{ matrix.python }}-DEV-integration-test-A100 + TASK: paddlefleet-CE-${{ matrix.cuda }}-${{ matrix.python }}-DEV-integration-test-single-card-release steps: - - name: setup cuda and python + - name: Determine the runner run: | + gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 )) cuda=${{ matrix.cuda }} python_version=${{ matrix.python }} + echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV if [ "${cuda}" == "12.9" ]; then docker_image=${docker_image_cu129} cuda_version="cu129" @@ -1138,9 +86,10 @@ jobs: echo "CUDA_VERSION=${cuda_version}" >> $GITHUB_ENV echo "PYTHON_VERSION=${python_version}" >> $GITHUB_ENV echo "PADDLE_URL=${paddle_url}" >> $GITHUB_ENV - echo "BASE_NAME=${cuda_version}-${python_version}-A100" >> $GITHUB_ENV + echo "BASE_NAME=${cuda_version}-${python_version}-H20" >> $GITHUB_ENV - name: Check docker image and run container env: + GPU_DEVICES: ${{ env.GPU_DEVICES }} DOCKER_IMAGE: ${{ env.DOCKER_IMAGE }} CUDA_VERSION: ${{ env.CUDA_VERSION }} PYTHON_VERSION: ${{ env.PYTHON_VERSION }} @@ -1150,7 +99,8 @@ jobs: container_name=${TASK}-$(date +%Y%m%d-%H%M%S) echo "container_name=${container_name}" >> ${{ github.env }} docker pull ${DOCKER_IMAGE} - docker run -d -t --name ${container_name} --gpus all --shm-size=32G \ + set -x + docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \ -v "/dev/shm:/dev/shm" \ -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ -v ${{ github.workspace }}/../../..:/root \ @@ -1180,6 +130,7 @@ jobs: - name: Install PaddleFleet run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' + nvidia-smi find . -maxdepth 1 -name "--*" -delete rm -rf * .[^.]* source /root/proxy @@ -1203,8 +154,6 @@ jobs: git config user.email "paddle_ci@example.com" git config pull.rebase false pip install colorlog>=6.10.1 - pip uninstall paddlefleet -y - pip install colorlog>=6.10.1 pip install nvidia-cutlass-dsl==4.2.1 python -m pip install --pre paddlefleet --index-url https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/${CUDA_VERSION}/ --extra-index-url https://pypi.tuna.tsinghua.edu.cn/simple wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq @@ -1238,14 +187,14 @@ jobs: git config user.email "paddle_ci@example.com" git config pull.rebase false git log -1 + sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py + sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py pip install -e . --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ if [ "${use_release}" == "true" ]; then echo "Using pre-built paddle package from develop branch." pip install ${paddle_url} --index-url=https://www.paddlepaddle.org.cn/packages/nightly/${CUDA_VERSION}/ --force-reinstall --no-cache-dir fi - pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt - pip install bce-python-sdk==0.8.74 - pip install coverage==7.6.1 + pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt matplotlib==3.10.8 pytest parameterized pip install librosa==0.11.0 echo "Paddle Commit" python -c "import paddle; print(paddle.version.commit)" @@ -1255,139 +204,26 @@ jobs: python -c "import paddleformers; print(paddleformers.version.commit)" ' - - name: GLM4.5 pre-train - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="glm45_pt_multi_card_a100" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 pre-train.\033[0m" - fi - ' - - - name: GLM4.5 sft - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="glm45_sft_multi_card_a100" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m" - fi - ' - - - name: GLM4.5 lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="glm45_lora_multi_card_a100" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" - fi - ' - - - name: GLM4.5 dpo - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="glm45_dpo_multi_card_a100" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m" - fi - ' - - - name: GLM4.5 dpo_lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo_lora - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="glm45_dpo_lora_multi_card_a100" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo_lora.\033[0m" - fi - ' - - - name: Qwen pre-train - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh pt - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="qwen_pt_multi_card_a100" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen pre-train.\033[0m" - fi - ' - - - name: Qwen sft + - name: Proprocess for integration test if: (success() || failure()) && steps.formers_install.conclusion == 'success' run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' + docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy if [ "${PYTHON_VERSION}" != "3.10" ]; then source "$HOME/miniconda/etc/profile.d/conda.sh" conda activate python${PYTHON_VERSION} fi - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh sft - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="qwen_sft_multi_card_a100" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME + bash -x PaddleFormers/tests/integration_test/preprocess.sh + preprocess_exit_code=$? + if [[ "$preprocess_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mPreprocess failed.\033[0m" + exit 1 else - echo -e "\033[32mIntegration test succeeded: Qwen sft.\033[0m" + echo -e "\033[32mPreprocess succeeded.\033[0m" fi ' - - name: Qwen lora + - name: Integration test (GLM4.5 single-card) if: (success() || failure()) && steps.formers_install.conclusion == 'success' run: | docker exec -t ${{ env.container_name }} /bin/bash -c ' @@ -1396,17 +232,17 @@ jobs: source "$HOME/miniconda/etc/profile.d/conda.sh" conda activate python${PYTHON_VERSION} fi - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh lora - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="qwen_lora_multi_card_a100" + bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh + glm45_single_card_exit_code=$? + if [[ "$glm45_single_card_exit_code" != "0" ]]; then + export case_name="glm45_single_card" bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME else - echo -e "\033[32mIntegration test succeeded: Qwen lora.\033[0m" + echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m" fi ' - - name: Qwen vl sft + - name: Integration test (Qwen3-30B-A3B single-card) if: (success() || failure()) && steps.formers_install.conclusion == 'success' run: | docker exec -t ${{ env.container_name }} /bin/bash -c ' @@ -1415,53 +251,36 @@ jobs: source "$HOME/miniconda/etc/profile.d/conda.sh" conda activate python${PYTHON_VERSION} fi - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh tp8 a100 - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="qwen3vl_sft_a100_tp8_multi_card" + bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh + qwen3_single_card_exit_code=$? + if [[ "$qwen3_single_card_exit_code" != "0" ]]; then + export case_name="qwen3_single_card" bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME else - echo -e "\033[32mIntegration test succeeded: Qwen vl sft.\033[0m" + echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m" fi ' - - name: Qwen vl lora + - name: Qwen3-vl-8k-single-card if: (success() || failure()) && steps.formers_install.conclusion == 'success' run: | docker exec -t ${{ env.container_name }} /bin/bash -c ' source /root/proxy + sleep 24h if [ "${PYTHON_VERSION}" != "3.10" ]; then source "$HOME/miniconda/etc/profile.d/conda.sh" conda activate python${PYTHON_VERSION} fi - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_lora.sh a100 + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft_single_card.sh single exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="qwen3vl_lora_a100_multi_card" + if [[ "$exit_code" != "0" ]]; then + export case_name="qwen3vl_sft_single_card" bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME else - echo -e "\033[32mIntegration test succeeded: Qwen vl lora.\033[0m" + echo -e "\033[32mIntegration test succeeded: Qwen3-vl-8k-single-card.\033[0m" fi ' - - name: Qwen vl moe - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -c ' - source /root/proxy - if [ "${PYTHON_VERSION}" != "3.10" ]; then - source "$HOME/miniconda/etc/profile.d/conda.sh" - conda activate python${PYTHON_VERSION} - fi - timeout 10m bash -x PaddleFormers/tests/integration_test/qwen3vl_sft.sh moe a100 - exit_code=$? - if [ ${exit_code} -ne 0 ]; then - export case_name="qwen3vl_sft_a100_moe_multi_card" - bash PaddleFleet/ci/check_ce_precision.sh $case_name $BASE_NAME - else - echo -e "\033[32mIntegration test succeeded: Qwen vl moe.\033[0m" - fi - ' - name: Terminate and delete the container if: ${{ always() }}