diff --git a/2.projects/vllm-ep/.gitignore b/2.projects/vllm-ep/.gitignore new file mode 100644 index 0000000..75c7009 --- /dev/null +++ b/2.projects/vllm-ep/.gitignore @@ -0,0 +1,2 @@ +nvshmem_src* +*.sqsh \ No newline at end of file diff --git a/2.projects/vllm-ep/README.md b/2.projects/vllm-ep/README.md new file mode 100644 index 0000000..7a80a94 --- /dev/null +++ b/2.projects/vllm-ep/README.md @@ -0,0 +1,89 @@ +# vLLM Expert Parallel Deployment + +https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html + +1. Download NVSHMEM +```bash +wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz +``` +2. Set environment variables +```bash +GDRCOPY_VERSION=v2.5.1 +EFA_INSTALLER_VERSION=1.43.2 +AWS_OFI_NCCL_VERSION=v1.16.3 +NCCL_VERSION=v2.27.7-1 +NCCL_TESTS_VERSION=v2.16.9 +NVSHMEM_VERSION=3.3.9 +PPLX_KERNELS_COMMIT=12cecfda252e4e646417ac263d96e994d476ee5d +DEEPGEMM_COMMIT=ea9c5d9270226c5dd7a577c212e9ea385f6ef048 +DEEPEP_COMMIT=c18eabdebf1381978ff884d278f6083a6153be3f +TORCH_VERSION=2.7.1 +VLLM_VERSION=0.10.1.1 +TAG="vllm${VLLM_VERSION}-efa${EFA_INSTALLER_VERSION}-ofi${AWS_OFI_NCCL_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}-nvshmem${NVSHMEM_VERSION}" +VLLM_EP_CONTAINER_IMAGE_NAME_TAG="vllm-ep:${TAG}" +``` +3. 
Build the container image +```bash +docker build --progress=plain -f ./vllm-ep.Dockerfile \ + --build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \ + --build-arg="AWS_OFI_NCCL_VERSION=${AWS_OFI_NCCL_VERSION}" \ + --build-arg="NCCL_VERSION=${NCCL_VERSION}" \ + --build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \ + --build-arg="NVSHMEM_VERSION=${NVSHMEM_VERSION}" \ + --build-arg="PPLX_KERNELS_COMMIT=${PPLX_KERNELS_COMMIT}" \ + --build-arg="DEEPGEMM_COMMIT=${DEEPGEMM_COMMIT}" \ + --build-arg="DEEPEP_COMMIT=${DEEPEP_COMMIT}" \ + --build-arg="VLLM_VERSION=${VLLM_VERSION}" \ + --build-arg="TORCH_VERSION=${TORCH_VERSION}" \ + -t ${VLLM_EP_CONTAINER_IMAGE_NAME_TAG} \ + . +``` +4. [Optional] Convert the container image to a SquashFS file +```bash +enroot import -o ./vllm-ep.sqsh dockerd://${VLLM_EP_CONTAINER_IMAGE_NAME_TAG} +``` +5. Run the container on a single 8 GPU node +```bash +docker run --runtime nvidia --gpus all \ + -v "$HF_HOME":/root/.cache/huggingface \ + --env "HF_TOKEN=$HF_TOKEN" \ + -e VLLM_ALL2ALL_BACKEND=pplx \ + -e VLLM_USE_DEEP_GEMM=1 \ + -p 8000:8000 \ + --ipc=host \ + ${VLLM_EP_CONTAINER_IMAGE_NAME_TAG} \ + vllm serve deepseek-ai/DeepSeek-R1-0528 \ + --tensor-parallel-size 1 \ + --data-parallel-size 8 \ + --enable-expert-parallel +``` +6. Expected logs: + +FlashInfer Backend: +``` +[topk_topp_sampler.py:50] Using FlashInfer for top-p & top-k sampling. +``` + +DeepGEMM Backend: +``` +[fp8.py:512] Using DeepGemm kernels for Fp8MoEMethod. +``` + +PPLX Backend: +``` +[cuda_communicator.py:81] Using PPLX all2all manager. +``` +otherwise: +``` +[cuda_communicator.py:77] Using naive all2all manager. +``` +7. 
Benchmark +``` +vllm bench serve \ + --model deepseek-ai/DeepSeek-R1-0528 \ + --dataset-name random \ + --random-input-len 128 \ + --random-output-len 128 \ + --num-prompts 10000 \ + --ignore-eos +``` diff --git a/2.projects/vllm-ep/vllm-ep.Dockerfile b/2.projects/vllm-ep/vllm-ep.Dockerfile new file mode 100644 index 0000000..83a47df --- /dev/null +++ b/2.projects/vllm-ep/vllm-ep.Dockerfile @@ -0,0 +1,273 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +################################ NCCL ######################################## + +ARG GDRCOPY_VERSION=v2.4.4 +ARG EFA_INSTALLER_VERSION=1.42.0 +ARG AWS_OFI_NCCL_VERSION=v1.16.0 +ARG NCCL_VERSION=v2.27.5-1 +ARG NCCL_TESTS_VERSION=v2.16.4 + +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + libmlx5-1 \ + libnccl2 \ + libnccl-dev + +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +ENV OPAL_PREFIX= + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + autoconf \ + automake \ + build-essential \ + check \ + cmake \ + curl \ + debhelper \ + devscripts \ + git \ + gcc \ + gdb \ + kmod \ + libsubunit-dev \ + libtool \ + openssh-client \ + openssh-server \ + pkg-config \ + python3-distutils \ + vim \ + python3.10-dev \ + python3.10-venv +RUN apt-get purge -y cuda-compat-* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +ENV 
LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +## Install NVIDIA GDRCopy +## +## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure +## that the cuda-compat-xx-x package is the latest. +RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ + && cd /tmp/gdrcopy \ + && make prefix=/opt/gdrcopy install + +ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH +ENV CPATH=/opt/gdrcopy/include:$CPATH +ENV PATH=/opt/gdrcopy/bin:$PATH + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ + && cd /opt/nccl \ + && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +################################################### +## Install AWS-OFI-NCCL plugin +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev +#Switch from sh to bash to allow parameter 
expansion +SHELL ["/bin/bash", "-c"] +RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ + && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ + && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ + && ./configure --prefix=/opt/aws-ofi-nccl/install \ + --with-mpi=/opt/amazon/openmpi \ + --with-libfabric=/opt/amazon/efa \ + --with-cuda=/usr/local/cuda \ + --enable-platform-aws \ + && make -j $(nproc) \ + && make install \ + && cd .. \ + && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ + && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz + +SHELL ["/bin/sh", "-c"] + +################################################### +## Install NCCL-tests +RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) \ + MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +RUN rm -rf /var/lib/apt/lists/* + +## Set Open MPI variables to exclude network interface and conduit. 
+ENV OMPI_MCA_pml=^ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 +ENV PMIX_MCA_gds=hash + +## Set LD_PRELOAD for NCCL library +ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so + +################################ NVSHMEM ######################################## + +ENV NVSHMEM_DIR=/opt/nvshmem +ENV NVSHMEM_HOME=/opt/nvshmem + +# wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && tar -xvf nvshmem_src_3.2.5-1.txz +# or +# wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz +COPY ./nvshmem_src /nvshmem_src + +RUN cd /nvshmem_src \ + && mkdir -p build \ + && cd build \ + && cmake \ + -DNVSHMEM_PREFIX=/opt/nvshmem \ + -DCMAKE_INSTALL_PREFIX=/opt/nvshmem \ + \ + -DCUDA_HOME=/usr/local/cuda \ + -DCMAKE_CUDA_ARCHITECTURES="90a;100" \ + \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DGDRCOPY_HOME=/opt/gdrcopy \ + \ + -DNVSHMEM_USE_NCCL=1 \ + -DNCCL_HOME=/opt/nccl/build \ + -DNCCL_INCLUDE=/opt/nccl/build/include \ + \ + -DNVSHMEM_LIBFABRIC_SUPPORT=1 \ + -DLIBFABRIC_HOME=/opt/amazon/efa \ + \ + -DNVSHMEM_MPI_SUPPORT=1 \ + -DMPI_HOME=/opt/amazon/openmpi \ + \ + -DNVSHMEM_PMIX_SUPPORT=1 \ + -DPMIX_HOME=/opt/amazon/pmix \ + -DNVSHMEM_DEFAULT_PMIX=1 \ + \ + -DNVSHMEM_BUILD_TESTS=1 \ + -DNVSHMEM_BUILD_EXAMPLES=1 \ + -DNVSHMEM_BUILD_HYDRA_LAUNCHER=1 \ + -DNVSHMEM_BUILD_TXZ_PACKAGE=1 \ + \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + \ + -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + .. 
\ + && make -j$(nproc) \ + && make install + +ENV PATH=/opt/nvshmem/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH +# ENV PATH=/opt/nvshmem/bin:$PATH LD_LIBRARY_PATH=/opt/amazon/pmix/lib:/opt/nvshmem/lib:$LD_LIBRARY_PATH NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa + +################################ Python ######################################## + +RUN command -v python >/dev/null 2>&1 || ln -s "$(command -v python3)" /usr/bin/python + +################################ venv ######################################## + +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" + +################################ extra packages ######################################## + +RUN pip install ninja numpy cmake pytest blobfile + +################################ PyTorch ######################################## + +ARG TORCH_VERSION=2.7.1 + +RUN pip install torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu128 + +################################ vLLM ######################################## + +ARG VLLM_VERSION=0.10.1.1 + +# RUN pip install vllm==${VLLM_VERSION} + +RUN git clone https://github.com/vllm-project/vllm.git /vllm \ + && cd /vllm \ + && git checkout v${VLLM_VERSION} \ + && python use_existing_torch.py \ + && pip install -r requirements/build.txt \ + && pip install --no-build-isolation -e . 
+ +################################ flashInfer and flash-attn ######################################## + +RUN pip install flashinfer-python + +RUN pip install flash-attn --no-build-isolation + +################################ PPLX-KERNELS ######################################## + +# see: https://github.com/vllm-project/vllm/tree/main/tools/ep_kernels +# see: https://github.com/pbelevich/pplx-kernels-benchmark + +ARG PPLX_KERNELS_COMMIT=12cecfda252e4e646417ac263d96e994d476ee5d + +RUN git clone https://github.com/ppl-ai/pplx-kernels.git /pplx-kernels \ + && cd /pplx-kernels \ + && git checkout ${PPLX_KERNELS_COMMIT} +# COPY pplx-kernels /pplx-kernels + +RUN cd /pplx-kernels \ + && TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" python3 setup.py bdist_wheel \ + && pip install dist/*.whl + +ENV PYTHONPATH=/pplx-kernels + +################################ DeepGEMM ######################################## + +# see: https://github.com/deepseek-ai/DeepGEMM#installation + +ARG DEEPGEMM_COMMIT=ea9c5d9270226c5dd7a577c212e9ea385f6ef048 + +RUN git clone https://github.com/deepseek-ai/DeepGEMM.git /DeepGEMM \ + && cd /DeepGEMM \ + && git checkout ${DEEPGEMM_COMMIT} \ + && git submodule update --init --recursive \ + && ./install.sh + +################################ DeepEP ######################################## + +ARG DEEPEP_COMMIT=c18eabdebf1381978ff884d278f6083a6153be3f + +RUN git clone https://github.com/deepseek-ai/DeepEP.git /DeepEP \ + && cd /DeepEP \ + && git checkout ${DEEPEP_COMMIT} \ + && TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" pip install . diff --git a/2.projects/vllm-ep/vllm-tests/README.md b/2.projects/vllm-ep/vllm-tests/README.md new file mode 100644 index 0000000..c9825ce --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/README.md @@ -0,0 +1,39 @@ +To test vLLM on GB200, we will use the instructions in [Expert Parallel Deployment (vLLM)](https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html#single-node-deployment). 
+We will use the image defined [here](https://github.com/pbelevich/vllm-ep/blob/main/vllm-ep.Dockerfile). + +## Note +``` +EP=TP*DP +``` + +## Communication Backends for EP +vLLM supports three communication backends for EP: +image + +## Tests +For this section, we will benchmark: +1. pplx backend on a single node (1 tray) + 1. Model: deepseek-ai/deepseek-moe-16b-base +2. pplx backend on multiple nodes (1 rack) + 1. Model: deepseek-ai/DeepSeek-V3-0324 +3. pplx backend on multiple nodes (2 racks) + 1. Model: deepseek-ai/DeepSeek-V3-0324 +4. deepep_low_latency backend on multiple nodes (1 rack) + 1. Model: deepseek-ai/DeepSeek-V3-0324 + +We will not benchmark with the Expert Parallel Load Balancer (EPLB). We will also not test Disaggregated Serving for this run. + + +To submit as sbatch, use the `single_node.sbatch` file in this directory: +``` +# Set HF_TOKEN and HF_HOME as env vars first. Then: +sbatch single_node.sbatch +``` + +Logs and results will be written to a `logs_single_node` directory. + +## Takeaways +1. pplx as a backend doesn't work! This appears to be actively worked on: https://github.com/vllm-project/vllm/issues/24272, https://github.com/perplexityai/pplx-kernels/issues/36 +2. deepep also doesn't work! https://github.com/deepseek-ai/DeepEP/issues/392 +3. For vLLM -- use the native backend. +4. Also, we recommend that you use NCCL instead of NVSHMEM. There is no reason to use NVSHMEM. 
diff --git a/2.projects/vllm-ep/vllm-tests/ds-v3/ds-v3-vllm-2p6gb200-ep8.sbatch b/2.projects/vllm-ep/vllm-tests/ds-v3/ds-v3-vllm-2p6gb200-ep8.sbatch new file mode 100644 index 0000000..d7df09c --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/ds-v3/ds-v3-vllm-2p6gb200-ep8.sbatch @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --job-name=ds-vllm +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 + +set -x + +export MODEL_NAME=deepseek-ai/DeepSeek-V3-0324 +export MODEL_PATH=`python -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('$MODEL_NAME', filename='config.json')).parent)"` + +export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export PORT=13579 + +export VLLM_ALL2ALL_BACKEND=pplx +export VLLM_USE_DEEP_GEMM=1 + +srun -l \ + --mpi=pmix --cpu-bind=none \ + --container-image /fsx/ubuntu/belevich/vllm-ep/vllm-ep-nvshmem-aws-2.sqsh \ + --container-mounts=${HF_HOME}:${HF_HOME} \ + bash -c 'set -x; + [ "$SLURM_PROCID" -eq 0 ] && EXTRA_FLAGS="" || EXTRA_FLAGS="--headless --data-parallel-start-rank $((4 * SLURM_PROCID))"; +vllm serve $MODEL_PATH \ + $EXTRA_FLAGS \ + --port 8000 \ + --served-model-name $MODEL_NAME \ + --trust-remote-code \ + --data-parallel-size 8 \ + --data-parallel-size-local 4 \ + --data-parallel-address $MASTER_IP \ + --data-parallel-rpc-port $PORT \ + --enable-expert-parallel' diff --git a/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-16p6gb200-ep64.sbatch b/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-16p6gb200-ep64.sbatch new file mode 100644 index 0000000..20ab3e0 --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-16p6gb200-ep64.sbatch @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=k2-vllm +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 + +set -x + +export MODEL_NAME=moonshotai/Kimi-K2-Instruct +export MODEL_PATH=`python -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('$MODEL_NAME', 
filename='config.json')).parent)"` + +export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export PORT=13579 + +srun -l \ + --mpi=pmix --cpu-bind=none \ + --container-image /fsx/ubuntu/belevich/vllm-ep/vllm-ep-nvshmem-aws-2.sqsh \ + --container-mounts=${HF_HOME}:${HF_HOME} \ + bash -c 'set -x; + [ "$SLURM_PROCID" -eq 0 ] && EXTRA_FLAGS="" || EXTRA_FLAGS="--headless --data-parallel-start-rank $((4 * SLURM_PROCID))"; +vllm serve $MODEL_PATH \ + $EXTRA_FLAGS \ + --port 8000 \ + --served-model-name $MODEL_NAME \ + --trust-remote-code \ + --data-parallel-size 64 \ + --data-parallel-size-local 4 \ + --data-parallel-address $MASTER_IP \ + --data-parallel-rpc-port $PORT \ + --enable-expert-parallel \ + --max-num-batched-tokens 8192 \ + --max-num-seqs 256 \ + --gpu-memory-utilization 0.85 \ + --enable-auto-tool-choice \ + --tool-call-parser kimi_k2' diff --git a/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-4p6gb200-ep16.sbatch b/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-4p6gb200-ep16.sbatch new file mode 100644 index 0000000..5fb208c --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-4p6gb200-ep16.sbatch @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=k2-vllm +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=1 + +set -x + +export MODEL_NAME=moonshotai/Kimi-K2-Instruct +export MODEL_PATH=`python -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('$MODEL_NAME', filename='config.json')).parent)"` + +export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export PORT=13579 + +srun -l \ + --mpi=pmix --cpu-bind=none \ + --container-image /fsx/ubuntu/belevich/vllm-ep/vllm-ep-nvshmem-aws-2.sqsh \ + --container-mounts=${HF_HOME}:${HF_HOME} \ + bash -c 'set -x; + [ "$SLURM_PROCID" -eq 0 ] && EXTRA_FLAGS="" || EXTRA_FLAGS="--headless --data-parallel-start-rank $((4 * SLURM_PROCID))"; +vllm serve $MODEL_PATH \ + $EXTRA_FLAGS \ + --port 8000 \ + 
--served-model-name $MODEL_NAME \ + --trust-remote-code \ + --data-parallel-size 16 \ + --data-parallel-size-local 4 \ + --data-parallel-address $MASTER_IP \ + --data-parallel-rpc-port $PORT \ + --enable-expert-parallel \ + --max-num-batched-tokens 8192 \ + --max-num-seqs 256 \ + --gpu-memory-utilization 0.85 \ + --enable-auto-tool-choice \ + --tool-call-parser kimi_k2' diff --git a/2.projects/vllm-ep/vllm-tests/multi_node_1rack_pplx.sbatch b/2.projects/vllm-ep/vllm-tests/multi_node_1rack_pplx.sbatch new file mode 100644 index 0000000..0fe5c17 --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/multi_node_1rack_pplx.sbatch @@ -0,0 +1,129 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=vllm-ep-multi-node-1rack +#SBATCH --nodes=18 +#SBATCH --ntasks-per-node=1 +#SBATCH --output logs_multi_node_1rack/%x_%j.out +#SBATCH --error logs_multi_node_1rack/%x_%j.err +#SBATCH --exclusive=topo +#SBATCH --wait-all-nodes=1 + +### Disable hyperthreading by setting the tasks per core to 1 +#SBATCH --ntasks-per-core=1 + +set -x + +mkdir -p logs_multi_node_1rack + +########################### +###### User Variables ##### +########################### + +# default variables for Enroot +: "${APPS_PATH:=/fsx}" +: "${IMAGE:=$APPS_PATH/ubuntu/vLLM-testing/vllm-ep.sqsh}" +: "${HF_HOME:=/fsx/ubuntu/.cache/huggingface}" + +## Set libfabric flags to use EFA +export FI_PROVIDER=efa +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_EFA_FORK_SAFE=1 + +## Set vLLM Environment Variables +export VLLM_ALL2ALL_BACKEND=pplx +export VLLM_USE_DEEP_GEMM=1 +export VLLM_ENGINE_ITERATION_TIMEOUT_S=300 +export VLLM_RPC_TIMEOUT=60000 +export HF_TOKEN=$HF_TOKEN + +## Set this flag for debugging EFA +# export FI_LOG_LEVEL=warn + +## NCCL Environment variables +# export NCCL_DEBUG=INFO + +# Model +# export MODEL="deepseek-ai/deepseek-moe-16b-base" +export 
MODEL_NAME="/fsx/ubuntu/.cache/huggingface/hub/models--deepseek-ai--deepseek-moe-16b-base/snapshots/521d2bc4fb69a3f3ae565310fcc3b65f97af2580" +export TP=1 +export DP_TOTAL=72 # Total DP size across all nodes +export DP_NODE=4 # Local DP size per node + +# Get primary node IP +export PRIMARY_NODE=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n1) +export PRIMARY_IP=$(srun --nodes=1 --ntasks=1 -w $PRIMARY_NODE hostname -I | awk '{print $1}') +export PORT=13345 +echo "Primary node: $PRIMARY_NODE ($PRIMARY_IP)" +echo "Total nodes: 18 (72 GPUs)" + +declare -a ARGS=( + --container-image $IMAGE + --container-mount-home + --container-mounts /fsx/ubuntu/vLLM-testing:/workspace + --container-mounts /dev/urandom:/dev/urandom + --container-mounts $(pwd)/logs_multi_node_1rack:/logs_multi_node_1rack + --container-env HF_TOKEN + --container-env VLLM_ALL2ALL_BACKEND + --container-env VLLM_USE_DEEP_GEMM + --container-env VLLM_ENGINE_ITERATION_TIMEOUT_S + --container-env VLLM_RPC_TIMEOUT + --container-env TP + --container-env DP_TOTAL + --container-env DP_NODE + --container-env MODEL_NAME + --container-env PRIMARY_IP + --container-env PORT + +) + +srun -l --mpi=pmix --cpu-bind=none "${ARGS[@]}" \ + bash -c 'set -x; + [ "$SLURM_PROCID" -eq 0 ] && EXTRA_FLAGS="--host 0.0.0.0 --port 8000 --api-server-count 4" || EXTRA_FLAGS="--headless --data-parallel-start-rank $((4 * SLURM_PROCID))"; + + # Start vLLM server + vllm serve $MODEL_NAME \ + $EXTRA_FLAGS \ + --trust-remote-code \ + --enable-expert-parallel \ + --data-parallel-size $DP_TOTAL \ + --data-parallel-size-local $DP_NODE \ + --data-parallel-address $PRIMARY_IP \ + --data-parallel-rpc-port $PORT & + + # Only run benchmarks on primary node + if [ "$SLURM_PROCID" -eq 0 ]; then + # Wait for server to be ready + for i in {1..600}; do + if curl -s http://localhost:8000/v1/models > /dev/null 2>&1; then + echo "Primary server responding, starting benchmarks..." 
+ break + fi + sleep 5 + done + + # Run benchmarks + mkdir -p logs_multi_node_1rack + declare -a ISL_OSL_PAIRS=("128 128" "128 2048" "500 2000") + for pair in "${ISL_OSL_PAIRS[@]}"; do + read -r isl osl <<< "$pair" + echo "Testing ISL=$isl, OSL=$osl" + + vllm bench serve \ + --model $MODEL_NAME \ + --dataset-name random \ + --random-input-len $isl \ + --random-output-len $osl \ + --num-prompts 100 \ + --ignore-eos \ + --trust-remote-code \ + > /logs_multi_node_1rack/bench_${isl}_${osl}_${SLURM_JOB_ID}.log 2>&1 & + done + wait + else + # Worker nodes just wait + wait + fi + ' diff --git a/2.projects/vllm-ep/vllm-tests/single_node.sbatch b/2.projects/vllm-ep/vllm-tests/single_node.sbatch new file mode 100644 index 0000000..c5c2bf6 --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/single_node.sbatch @@ -0,0 +1,113 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=vllm-ep-single-node +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output logs_single_node/%x_%j.out +#SBATCH --error logs_single_node/%x_%j.err +#SBATCH --exclusive=topo +#SBATCH --wait-all-nodes=1 + +### Disable hyperthreading by setting the tasks per core to 1 +#SBATCH --ntasks-per-core=1 + +mkdir -p logs_single_node + +########################### +###### User Variables ##### +########################### + +# default variables for Enroot +: "${APPS_PATH:=/fsx}" +: "${IMAGE:=$APPS_PATH/ubuntu/vLLM-testing/vllm-ep.sqsh}" +: "${HF_HOME:=/fsx/ubuntu/.cache/huggingface}" + +## Set libfabric flags to use EFA +export FI_PROVIDER=efa +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_EFA_FORK_SAFE=1 + +## Set vLLM Environment Variables +export VLLM_ALL2ALL_BACKEND=pplx +export VLLM_USE_DEEP_GEMM=1 +export HF_TOKEN=$HF_TOKEN + +## Set this flag for debugging EFA +export FI_LOG_LEVEL=warn + +## NCCL Environment variables +export NCCL_DEBUG=INFO + +declare -a ARGS=( + --container-image $IMAGE + --container-mount-home + 
--container-mounts /fsx/ubuntu/vLLM-testing:/workspace + --container-env HF_TOKEN + --container-env VLLM_ALL2ALL_BACKEND + --container-env VLLM_USE_DEEP_GEMM +) + +# Start vLLM server and run benchmarks in single srun +echo "Starting vLLM server and benchmarks..." +srun "${ARGS[@]}" bash -c " + cd /workspace + + # ISL/OSL combinations to test + declare -a ISL_OSL_PAIRS=( + \"128 128\" + \"128 2048\" + \"128 4096\" + \"500 2000\" + \"1024 2048\" + \"2048 128\" + \"2048 2048\" + \"5000 500\" + \"20000 2000\" + ) + + # Model + export MODEL=\"deepseek-ai/deepseek-moe-16b-base\" + export TP=1 + export DP=4 + + # Start server in background + vllm serve \$MODEL \ + --trust-remote-code \ + --tensor-parallel-size \$TP \ + --data-parallel-size \$DP \ + --enable-expert-parallel \ + --host 0.0.0.0 \ + --port 8000 & + + SERVER_PID=\$! + # Wait for server + for i in {1..120}; do + if curl -s http://localhost:8000/v1/models > /dev/null 2>&1; then + echo 'Server ready' + break + fi + sleep 5 + done + + # Run benchmarks + for pair in \"\${ISL_OSL_PAIRS[@]}\"; do + read -r isl osl <<< \"\$pair\" + echo \"Testing ISL=\$isl, OSL=\$osl\" + + vllm bench serve \ + --model \$MODEL \ + --dataset-name random \ + --random-input-len \$isl \ + --random-output-len \$osl \ + --num-prompts 1000 \ + --ignore-eos \ + --trust-remote-code \ + > logs_single_node/results_\${isl}_\${osl}_${SLURM_JOB_ID}.txt 2>&1 + done + + # Cleanup + kill \$SERVER_PID +"