# [feat] add fsdp2 to fsdp_sft_trainer (#1713) #3140
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: vllm

on:
  # Trigger the workflow on push or pull request,
  # but only for the main and v0.* branches
  push:
    branches:
      - main
      - "v0.*"
  pull_request:
    branches:
      - main
      - "v0.*"
    # Path filters are evaluated in order: a later pattern overrides an
    # earlier one, so the "Entrypoints" group at the bottom re-includes
    # files that the negated ("!") patterns above it exclude.
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Recipes
      - "!recipe/**"
      # FSDP
      - "!verl/workers/**/*dp_*.py"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      # SGLang
      - "!**/*sglang*"
      # Entrypoints
      # A pattern must match the whole file path, so a bare directory name
      # matches nothing inside that directory; use a trailing "/**".
      - ".github/workflows/vllm.yml"
      - "tests/e2e/generation/**"
      - "tests/rollout/**"
      # The steps of this workflow run the vLLM tests in tests/workers/rollout.
      - "tests/workers/rollout/*vllm*"
      - "verl/trainer/main_generation.py"
      - "verl/trainer/config/generation.yaml"
# A newly triggered run cancels the in-progress run for the same
# workflow and ref, except on the main branch.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Restrict the token to read-only access to repository contents.
permissions:
  contents: read
jobs:
  # vLLM rollout and generation tests, run inside a GPU container on the
  # runner labelled L20x8.
  vllm:
    runs-on: [L20x8]
    timeout-minutes: 60  # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"  # This is more stable
    container:
      image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6
      options: --gpus all --shm-size=10g
    steps:
      # Action pinned to a full commit SHA; the tag it corresponds to is
      # noted in the trailing comment.
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          # Quote the extras spec so the shell never glob-expands "[test]".
          pip3 install -e ".[test]"
          pip3 install vllm==0.5.4
      - name: Download Model to Use
        run: |
          huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct
          huggingface-cli download 'Qwen/Qwen2-7B-Instruct'
          huggingface-cli download 'deepseek-ai/deepseek-llm-7b-chat'
          # Disable requests to avoid network errors.
          # A plain `export` only lives for this step's shell; appending to
          # GITHUB_ENV makes the variable visible to all following steps.
          # NOTE(review): assumes the later steps only need the models
          # downloaded above - confirm.
          echo "HF_HUB_OFFLINE=1" >> "$GITHUB_ENV"
      - name: Running vllm tests on 8 L20 GPUs
        run: |
          cd tests/workers/rollout
          torchrun --standalone --nnodes=1 --nproc_per_node=8 $(which pytest) -s test_vllm_hf_loader.py
      - name: Test the latest vLLM
        run: |
          # Replaces the vllm version installed earlier in this job.
          pip3 install --upgrade vllm==0.7.3
          cd tests/workers/rollout
          torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_vllm_spmd.py
      - name: Run Qwen 0.5B generation test
        run: |
          cd tests/e2e/generation
          export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
          MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=4 GEN_TP=2 bash ./run_gen_qwen05.sh
          # Remove the output path so the next step starts clean.
          rm -rf "${OUTPUT_PATH}"
      - name: Run Qwen 0.5B generation test when world_size == 1
        run: |
          cd tests/e2e/generation
          export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
          MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=1 GEN_TP=1 bash ./run_gen_qwen05.sh
          rm -rf "${OUTPUT_PATH}"
      - name: Running multi-turn rollout tests on 8 L20 GPUs
        run: |
          # Upgrades vllm again and pins tensordict before the multi-turn test.
          pip3 install --upgrade vllm==0.8.3 tensordict==0.7.2
          python3 tests/workers/rollout/test_vllm_multi_turn.py