diff --git a/2.projects/vllm-ep/.gitignore b/2.projects/vllm-ep/.gitignore new file mode 100644 index 0000000..75c7009 --- /dev/null +++ b/2.projects/vllm-ep/.gitignore @@ -0,0 +1,2 @@ +nvshmem_src* +*.sqsh \ No newline at end of file diff --git a/2.projects/vllm-ep/README.md b/2.projects/vllm-ep/README.md new file mode 100644 index 0000000..7a80a94 --- /dev/null +++ b/2.projects/vllm-ep/README.md @@ -0,0 +1,89 @@ +# vLLM Expert Parallel Deployment + +https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html + +1. Download NVSHMEM +```bash +wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz +``` +2. Set environment variables +```bash +GDRCOPY_VERSION=v2.5.1 +EFA_INSTALLER_VERSION=1.43.2 +AWS_OFI_NCCL_VERSION=v1.16.3 +NCCL_VERSION=v2.27.7-1 +NCCL_TESTS_VERSION=v2.16.9 +NVSHMEM_VERSION=3.3.9 +PPLX_KERNELS_COMMIT=12cecfda252e4e646417ac263d96e994d476ee5d +DEEPGEMM_COMMIT=ea9c5d9270226c5dd7a577c212e9ea385f6ef048 +DEEPEP_COMMIT=c18eabdebf1381978ff884d278f6083a6153be3f +TORCH_VERSION=2.7.1 +VLLM_VERSION=0.10.1.1 +TAG="vllm${VLLM_VERSION}-efa${EFA_INSTALLER_VERSION}-ofi${AWS_OFI_NCCL_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}-nvshmem${NVSHMEM_VERSION}" +VLLM_EP_CONTAINER_IMAGE_NAME_TAG="vllm-ep:${TAG}" +``` +3. 
Build the container image +```bash +docker build --progress=plain -f ./vllm-ep.Dockerfile \ + --build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \ + --build-arg="AWS_OFI_NCCL_VERSION=${AWS_OFI_NCCL_VERSION}" \ + --build-arg="NCCL_VERSION=${NCCL_VERSION}" \ + --build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \ + --build-arg="NVSHMEM_VERSION=${NVSHMEM_VERSION}" \ + --build-arg="PPLX_KERNELS_COMMIT=${PPLX_KERNELS_COMMIT}" \ + --build-arg="DEEPGEMM_COMMIT=${DEEPGEMM_COMMIT}" \ + --build-arg="DEEPEP_COMMIT=${DEEPEP_COMMIT}" \ + --build-arg="VLLM_VERSION=${VLLM_VERSION}" \ + --build-arg="TORCH_VERSION=${TORCH_VERSION}" \ + -t ${VLLM_EP_CONTAINER_IMAGE_NAME_TAG} \ + . +``` +4. [Optional] Convert the container image to a SquashFS file +```bash +enroot import -o ./vllm-ep.sqsh dockerd://${VLLM_EP_CONTAINER_IMAGE_NAME_TAG} +``` +5. Run the container on a single 8 GPU node +```bash +docker run --runtime nvidia --gpus all \ + -v "$HF_HOME":/root/.cache/huggingface \ + --env "HF_TOKEN=$HF_TOKEN" \ + -e VLLM_ALL2ALL_BACKEND=pplx \ + -e VLLM_USE_DEEP_GEMM=1 \ + -p 8000:8000 \ + --ipc=host \ + ${VLLM_EP_CONTAINER_IMAGE_NAME_TAG} \ + vllm serve deepseek-ai/DeepSeek-R1-0528 \ + --tensor-parallel-size 1 \ + --data-parallel-size 8 \ + --enable-expert-parallel +``` +6. Expected logs: + +FlashInfer Backend: +``` +[topk_topp_sampler.py:50] Using FlashInfer for top-p & top-k sampling. +``` + +DeepGEMM Backend: +``` +[fp8.py:512] Using DeepGemm kernels for Fp8MoEMethod. +``` + +PPLX Backend: +``` +[cuda_communicator.py:81] Using PPLX all2all manager. +``` +otherwise: +``` +[cuda_communicator.py:77] Using naive all2all manager. +``` +7. 
Benchmark +``` +vllm bench serve \ + --model deepseek-ai/DeepSeek-R1-0528 \ + --dataset-name random \ + --random-input-len 128 \ + --random-output-len 128 \ + --num-prompts 10000 \ + --ignore-eos +``` diff --git a/2.projects/vllm-ep/vllm-ep.Dockerfile b/2.projects/vllm-ep/vllm-ep.Dockerfile new file mode 100644 index 0000000..83a47df --- /dev/null +++ b/2.projects/vllm-ep/vllm-ep.Dockerfile @@ -0,0 +1,273 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +################################ NCCL ######################################## + +ARG GDRCOPY_VERSION=v2.4.4 +ARG EFA_INSTALLER_VERSION=1.42.0 +ARG AWS_OFI_NCCL_VERSION=v1.16.0 +ARG NCCL_VERSION=v2.27.5-1 +ARG NCCL_TESTS_VERSION=v2.16.4 + +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + libmlx5-1 \ + libnccl2 \ + libnccl-dev + +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +ENV OPAL_PREFIX= + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + autoconf \ + automake \ + build-essential \ + check \ + cmake \ + curl \ + debhelper \ + devscripts \ + git \ + gcc \ + gdb \ + kmod \ + libsubunit-dev \ + libtool \ + openssh-client \ + openssh-server \ + pkg-config \ + python3-distutils \ + vim \ + python3.10-dev \ + python3.10-venv +RUN apt-get purge -y cuda-compat-* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +ENV 
LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +## Install NVIDIA GDRCopy +## +## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure +## that the cuda-compat-xx-x package is the latest. +RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ + && cd /tmp/gdrcopy \ + && make prefix=/opt/gdrcopy install + +ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH +ENV CPATH=/opt/gdrcopy/include:$CPATH +ENV PATH=/opt/gdrcopy/bin:$PATH + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ + && cd /opt/nccl \ + && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +################################################### +## Install AWS-OFI-NCCL plugin +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev +#Switch from sh to bash to allow parameter 
expansion +SHELL ["/bin/bash", "-c"] +RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ + && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ + && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ + && ./configure --prefix=/opt/aws-ofi-nccl/install \ + --with-mpi=/opt/amazon/openmpi \ + --with-libfabric=/opt/amazon/efa \ + --with-cuda=/usr/local/cuda \ + --enable-platform-aws \ + && make -j $(nproc) \ + && make install \ + && cd .. \ + && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ + && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz + +SHELL ["/bin/sh", "-c"] + +################################################### +## Install NCCL-tests +RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) \ + MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +RUN rm -rf /var/lib/apt/lists/* + +## Set Open MPI variables to exclude network interface and conduit. 
+ENV OMPI_MCA_pml=^ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 +ENV PMIX_MCA_gds=hash + +## Set LD_PRELOAD for NCCL library +ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so + +################################ NVSHMEM ######################################## + +ENV NVSHMEM_DIR=/opt/nvshmem +ENV NVSHMEM_HOME=/opt/nvshmem + +# wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && tar -xvf nvshmem_src_3.2.5-1.txz +# or +# wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz +COPY ./nvshmem_src /nvshmem_src + +RUN cd /nvshmem_src \ + && mkdir -p build \ + && cd build \ + && cmake \ + -DNVSHMEM_PREFIX=/opt/nvshmem \ + -DCMAKE_INSTALL_PREFIX=/opt/nvshmem \ + \ + -DCUDA_HOME=/usr/local/cuda \ + -DCMAKE_CUDA_ARCHITECTURES="90a;100" \ + \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DGDRCOPY_HOME=/opt/gdrcopy \ + \ + -DNVSHMEM_USE_NCCL=1 \ + -DNCCL_HOME=/opt/nccl/build \ + -DNCCL_INCLUDE=/opt/nccl/build/include \ + \ + -DNVSHMEM_LIBFABRIC_SUPPORT=1 \ + -DLIBFABRIC_HOME=/opt/amazon/efa \ + \ + -DNVSHMEM_MPI_SUPPORT=1 \ + -DMPI_HOME=/opt/amazon/openmpi \ + \ + -DNVSHMEM_PMIX_SUPPORT=1 \ + -DPMIX_HOME=/opt/amazon/pmix \ + -DNVSHMEM_DEFAULT_PMIX=1 \ + \ + -DNVSHMEM_BUILD_TESTS=1 \ + -DNVSHMEM_BUILD_EXAMPLES=1 \ + -DNVSHMEM_BUILD_HYDRA_LAUNCHER=1 \ + -DNVSHMEM_BUILD_TXZ_PACKAGE=1 \ + \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + \ + -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + .. 
\ + && make -j$(nproc) \ + && make install + +ENV PATH=/opt/nvshmem/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH +# ENV PATH=/opt/nvshmem/bin:$PATH LD_LIBRARY_PATH=/opt/amazon/pmix/lib:/opt/nvshmem/lib:$LD_LIBRARY_PATH NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa + +################################ Python ######################################## + +RUN command -v python >/dev/null 2>&1 || ln -s "$(command -v python3)" /usr/bin/python + +################################ venv ######################################## + +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" + +################################ extra packages ######################################## + +RUN pip install ninja numpy cmake pytest blobfile + +################################ PyTorch ######################################## + +ARG TORCH_VERSION=2.7.1 + +RUN pip install torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu128 + +################################ vLLM ######################################## + +ARG VLLM_VERSION=0.10.1.1 + +# RUN pip install vllm==${VLLM_VERSION} + +RUN git clone https://github.com/vllm-project/vllm.git /vllm \ + && cd /vllm \ + && git checkout v${VLLM_VERSION} \ + && python use_existing_torch.py \ + && pip install -r requirements/build.txt \ + && pip install --no-build-isolation -e . 
+ +################################ flashInfer and flash-attn ######################################## + +RUN pip install flashinfer-python + +RUN pip install flash-attn --no-build-isolation + +################################ PPLX-KERNELS ######################################## + +# see: https://github.com/vllm-project/vllm/tree/main/tools/ep_kernels +# see: https://github.com/pbelevich/pplx-kernels-benchmark + +ARG PPLX_KERNELS_COMMIT=12cecfda252e4e646417ac263d96e994d476ee5d + +RUN git clone https://github.com/ppl-ai/pplx-kernels.git /pplx-kernels \ + && cd /pplx-kernels \ + && git checkout ${PPLX_KERNELS_COMMIT} +# COPY pplx-kernels /pplx-kernels + +RUN cd /pplx-kernels \ + && TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" python3 setup.py bdist_wheel \ + && pip install dist/*.whl + +ENV PYTHONPATH=/pplx-kernels + +################################ DeepGEMM ######################################## + +# see: https://github.com/deepseek-ai/DeepGEMM#installation + +ARG DEEPGEMM_COMMIT=ea9c5d9270226c5dd7a577c212e9ea385f6ef048 + +RUN git clone https://github.com/deepseek-ai/DeepGEMM.git /DeepGEMM \ + && cd /DeepGEMM \ + && git checkout ${DEEPGEMM_COMMIT} \ + && git submodule update --init --recursive \ + && ./install.sh + +################################ DeepEP ######################################## + +ARG DEEPEP_COMMIT=c18eabdebf1381978ff884d278f6083a6153be3f + +RUN git clone https://github.com/deepseek-ai/DeepEP.git /DeepEP \ + && cd /DeepEP \ + && git checkout ${DEEPEP_COMMIT} \ + && TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" pip install . diff --git a/2.projects/vllm-ep/vllm-tests/README.md b/2.projects/vllm-ep/vllm-tests/README.md new file mode 100644 index 0000000..c9825ce --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/README.md @@ -0,0 +1,39 @@ +To test vLLM on GB200, we will use the instructions in [Expert Parallel Deployment (vLLM)](https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment.html#single-node-deployment). 
+We will use the image defined [here](https://github.com/pbelevich/vllm-ep/blob/main/vllm-ep.Dockerfile). + +## Note +``` +EP=TP*DP +``` + +## Communication Backends for EP +vLLM supports three communication backends for EP: +image + +## Tests +For this section, we will benchmark: +1. pplx backend on a single node (1 tray) + 1. Model: deepseek-ai/deepseek-moe-16b-base +2. pplx backend on multiple nodes (1 rack) + 1. Model: deepseek-ai/DeepSeek-V3-0324 +3. pplx backend on multiple nodes (2 racks) + 1. Model: deepseek-ai/DeepSeek-V3-0324 +4. deepep_low_latency backend on multiple nodes (1 rack) + 1. Model: deepseek-ai/DeepSeek-V3-0324 + +We will not benchmark with the Expert Parallel Load Balancer (EPLB). We will also not test Disaggregated Serving for this run. + + +To submit as sbatch, use the `single_node.sbatch` file in this directory: +``` +# Set HF_TOKEN and HF_HOME as env vars first. Then: +sbatch single_node.sbatch +``` + +Logs and results will be written to a `logs_single_node` directory. + +## Takeaways +1. pplx as a backend doesn't work! This appears to be actively worked on: https://github.com/vllm-project/vllm/issues/24272, https://github.com/perplexityai/pplx-kernels/issues/36 +2. deepep also doesn't work! https://github.com/deepseek-ai/DeepEP/issues/392 +3. For vLLM -- use the native backend. +4. Also, we recommend that you use NCCL instead of NVSHMEM. There is no reason to use NVSHMEM. 
diff --git a/2.projects/vllm-ep/vllm-tests/ds-v3/ds-v3-vllm-2p6gb200-ep8.sbatch b/2.projects/vllm-ep/vllm-tests/ds-v3/ds-v3-vllm-2p6gb200-ep8.sbatch new file mode 100644 index 0000000..d7df09c --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/ds-v3/ds-v3-vllm-2p6gb200-ep8.sbatch @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --job-name=ds-vllm +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 + +set -x + +export MODEL_NAME=deepseek-ai/DeepSeek-V3-0324 +export MODEL_PATH=`python -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('$MODEL_NAME', filename='config.json')).parent)"` + +export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export PORT=13579 + +export VLLM_ALL2ALL_BACKEND=pplx +export VLLM_USE_DEEP_GEMM=1 + +srun -l \ + --mpi=pmix --cpu-bind=none \ + --container-image /fsx/ubuntu/belevich/vllm-ep/vllm-ep-nvshmem-aws-2.sqsh \ + --container-mounts=${HF_HOME}:${HF_HOME} \ + bash -c 'set -x; + [ "$SLURM_PROCID" -eq 0 ] && EXTRA_FLAGS="" || EXTRA_FLAGS="--headless --data-parallel-start-rank $((4 * SLURM_PROCID))"; +vllm serve $MODEL_PATH \ + $EXTRA_FLAGS \ + --port 8000 \ + --served-model-name $MODEL_NAME \ + --trust-remote-code \ + --data-parallel-size 8 \ + --data-parallel-size-local 4 \ + --data-parallel-address $MASTER_IP \ + --data-parallel-rpc-port $PORT \ + --enable-expert-parallel' diff --git a/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-16p6gb200-ep64.sbatch b/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-16p6gb200-ep64.sbatch new file mode 100644 index 0000000..20ab3e0 --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-16p6gb200-ep64.sbatch @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=k2-vllm +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 + +set -x + +export MODEL_NAME=moonshotai/Kimi-K2-Instruct +export MODEL_PATH=`python -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('$MODEL_NAME', 
filename='config.json')).parent)"` + +export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export PORT=13579 + +srun -l \ + --mpi=pmix --cpu-bind=none \ + --container-image /fsx/ubuntu/belevich/vllm-ep/vllm-ep-nvshmem-aws-2.sqsh \ + --container-mounts=${HF_HOME}:${HF_HOME} \ + bash -c 'set -x; + [ "$SLURM_PROCID" -eq 0 ] && EXTRA_FLAGS="" || EXTRA_FLAGS="--headless --data-parallel-start-rank $((4 * SLURM_PROCID))"; +vllm serve $MODEL_PATH \ + $EXTRA_FLAGS \ + --port 8000 \ + --served-model-name $MODEL_NAME \ + --trust-remote-code \ + --data-parallel-size 64 \ + --data-parallel-size-local 4 \ + --data-parallel-address $MASTER_IP \ + --data-parallel-rpc-port $PORT \ + --enable-expert-parallel \ + --max-num-batched-tokens 8192 \ + --max-num-seqs 256 \ + --gpu-memory-utilization 0.85 \ + --enable-auto-tool-choice \ + --tool-call-parser kimi_k2' diff --git a/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-4p6gb200-ep16.sbatch b/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-4p6gb200-ep16.sbatch new file mode 100644 index 0000000..5fb208c --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/kimi-k2/kimi-k2-vllm-4p6gb200-ep16.sbatch @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=k2-vllm +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=1 + +set -x + +export MODEL_NAME=moonshotai/Kimi-K2-Instruct +export MODEL_PATH=`python -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('$MODEL_NAME', filename='config.json')).parent)"` + +export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export PORT=13579 + +srun -l \ + --mpi=pmix --cpu-bind=none \ + --container-image /fsx/ubuntu/belevich/vllm-ep/vllm-ep-nvshmem-aws-2.sqsh \ + --container-mounts=${HF_HOME}:${HF_HOME} \ + bash -c 'set -x; + [ "$SLURM_PROCID" -eq 0 ] && EXTRA_FLAGS="" || EXTRA_FLAGS="--headless --data-parallel-start-rank $((4 * SLURM_PROCID))"; +vllm serve $MODEL_PATH \ + $EXTRA_FLAGS \ + --port 8000 \ + 
--served-model-name $MODEL_NAME \ + --trust-remote-code \ + --data-parallel-size 16 \ + --data-parallel-size-local 4 \ + --data-parallel-address $MASTER_IP \ + --data-parallel-rpc-port $PORT \ + --enable-expert-parallel \ + --max-num-batched-tokens 8192 \ + --max-num-seqs 256 \ + --gpu-memory-utilization 0.85 \ + --enable-auto-tool-choice \ + --tool-call-parser kimi_k2' diff --git a/2.projects/vllm-ep/vllm-tests/multi_node_1rack_pplx.sbatch b/2.projects/vllm-ep/vllm-tests/multi_node_1rack_pplx.sbatch new file mode 100644 index 0000000..0fe5c17 --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/multi_node_1rack_pplx.sbatch @@ -0,0 +1,129 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=vllm-ep-multi-node-1rack +#SBATCH --nodes=18 +#SBATCH --ntasks-per-node=1 +#SBATCH --output logs_multi_node_1rack/%x_%j.out +#SBATCH --error logs_multi_node_1rack/%x_%j.err +#SBATCH --exclusive=topo +#SBATCH --wait-all-nodes=1 + +### Disable hyperthreading by setting the tasks per core to 1 +#SBATCH --ntasks-per-core=1 + +set -x + +mkdir -p logs_multi_node_1rack + +########################### +###### User Variables ##### +########################### + +# default variables for Enroot +: "${APPS_PATH:=/fsx}" +: "${IMAGE:=$APPS_PATH/ubuntu/vLLM-testing/vllm-ep.sqsh}" +: "${HF_HOME:=/fsx/ubuntu/.cache/huggingface}" + +## Set libfabric flags to use EFA +export FI_PROVIDER=efa +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_EFA_FORK_SAFE=1 + +## Set vLLM Environment Variables +export VLLM_ALL2ALL_BACKEND=pplx +export VLLM_USE_DEEP_GEMM=1 +export VLLM_ENGINE_ITERATION_TIMEOUT_S=300 +export VLLM_RPC_TIMEOUT=60000 +export HF_TOKEN=$HF_TOKEN + +## Set this flag for debugging EFA +# export FI_LOG_LEVEL=warn + +## NCCL Environment variables +# export NCCL_DEBUG=INFO + +# Model +# export MODEL="deepseek-ai/deepseek-moe-16b-base" +export 
MODEL_NAME="/fsx/ubuntu/.cache/huggingface/hub/models--deepseek-ai--deepseek-moe-16b-base/snapshots/521d2bc4fb69a3f3ae565310fcc3b65f97af2580" +export TP=1 +export DP_TOTAL=72 # Total DP size across all nodes +export DP_NODE=4 # Local DP size per node + +# Get primary node IP +export PRIMARY_NODE=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n1) +export PRIMARY_IP=$(srun --nodes=1 --ntasks=1 -w $PRIMARY_NODE hostname -I | awk '{print $1}') +export PORT=13345 +echo "Primary node: $PRIMARY_NODE ($PRIMARY_IP)" +echo "Total nodes: 18 (72 GPUs)" + +declare -a ARGS=( + --container-image $IMAGE + --container-mount-home + --container-mounts /fsx/ubuntu/vLLM-testing:/workspace + --container-mounts /dev/urandom:/dev/urandom + --container-mounts $(pwd)/logs_multi_node_1rack:/logs_multi_node_1rack + --container-env HF_TOKEN + --container-env VLLM_ALL2ALL_BACKEND + --container-env VLLM_USE_DEEP_GEMM + --container-env VLLM_ENGINE_ITERATION_TIMEOUT_S + --container-env VLLM_RPC_TIMEOUT + --container-env TP + --container-env DP_TOTAL + --container-env DP_NODE + --container-env MODEL_NAME + --container-env PRIMARY_IP + --container-env PORT + +) + +srun -l --mpi=pmix --cpu-bind=none "${ARGS[@]}" \ + bash -c 'set -x; + [ "$SLURM_PROCID" -eq 0 ] && EXTRA_FLAGS="--host 0.0.0.0 --port 8000 --api-server-count 4" || EXTRA_FLAGS="--headless --data-parallel-start-rank $((4 * SLURM_PROCID))"; + + # Start vLLM server + vllm serve $MODEL_NAME \ + $EXTRA_FLAGS \ + --trust-remote-code \ + --enable-expert-parallel \ + --data-parallel-size $DP_TOTAL \ + --data-parallel-size-local $DP_NODE \ + --data-parallel-address $PRIMARY_IP \ + --data-parallel-rpc-port $PORT & + + # Only run benchmarks on primary node + if [ "$SLURM_PROCID" -eq 0 ]; then + # Wait for server to be ready + for i in {1..600}; do + if curl -s http://localhost:8000/v1/models > /dev/null 2>&1; then + echo "Primary server responding, starting benchmarks..." 
+ break + fi + sleep 5 + done + + # Run benchmarks + mkdir -p logs_multi_node_1rack + declare -a ISL_OSL_PAIRS=("128 128" "128 2048" "500 2000") + for pair in "${ISL_OSL_PAIRS[@]}"; do + read -r isl osl <<< "$pair" + echo "Testing ISL=$isl, OSL=$osl" + + vllm bench serve \ + --model $MODEL_NAME \ + --dataset-name random \ + --random-input-len $isl \ + --random-output-len $osl \ + --num-prompts 100 \ + --ignore-eos \ + --trust-remote-code \ + > /logs_multi_node_1rack/bench_${isl}_${osl}_${SLURM_JOB_ID}.log 2>&1 & + done + wait + else + # Worker nodes just wait + wait + fi + ' diff --git a/2.projects/vllm-ep/vllm-tests/single_node.sbatch b/2.projects/vllm-ep/vllm-tests/single_node.sbatch new file mode 100644 index 0000000..c5c2bf6 --- /dev/null +++ b/2.projects/vllm-ep/vllm-tests/single_node.sbatch @@ -0,0 +1,113 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=vllm-ep-single-node +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output logs_single_node/%x_%j.out +#SBATCH --error logs_single_node/%x_%j.err +#SBATCH --exclusive=topo +#SBATCH --wait-all-nodes=1 + +### Disable hyperthreading by setting the tasks per core to 1 +#SBATCH --ntasks-per-core=1 + +mkdir -p logs_single_node + +########################### +###### User Variables ##### +########################### + +# default variables for Enroot +: "${APPS_PATH:=/fsx}" +: "${IMAGE:=$APPS_PATH/ubuntu/vLLM-testing/vllm-ep.sqsh}" +: "${HF_HOME:=/fsx/ubuntu/.cache/huggingface}" + +## Set libfabric flags to use EFA +export FI_PROVIDER=efa +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_EFA_FORK_SAFE=1 + +## Set vLLM Environment Variables +export VLLM_ALL2ALL_BACKEND=pplx +export VLLM_USE_DEEP_GEMM=1 +export HF_TOKEN=$HF_TOKEN + +## Set this flag for debugging EFA +export FI_LOG_LEVEL=warn + +## NCCL Environment variables +export NCCL_DEBUG=INFO + +declare -a ARGS=( + --container-image $IMAGE + --container-mount-home + 
--container-mounts /fsx/ubuntu/vLLM-testing:/workspace + --container-env HF_TOKEN + --container-env VLLM_ALL2ALL_BACKEND + --container-env VLLM_USE_DEEP_GEMM +) + +# Start vLLM server and run benchmarks in single srun +echo "Starting vLLM server and benchmarks..." +srun "${ARGS[@]}" bash -c " + cd /workspace + + # ISL/OSL combinations to test + declare -a ISL_OSL_PAIRS=( + \"128 128\" + \"128 2048\" + \"128 4096\" + \"500 2000\" + \"1024 2048\" + \"2048 128\" + \"2048 2048\" + \"5000 500\" + \"20000 2000\" + ) + + # Model + export MODEL=\"deepseek-ai/deepseek-moe-16b-base\" + export TP=1 + export DP=4 + + # Start server in background + vllm serve \$MODEL \ + --trust-remote-code \ + --tensor-parallel-size \$TP \ + --data-parallel-size \$DP \ + --enable-expert-parallel \ + --host 0.0.0.0 \ + --port 8000 & + + SERVER_PID=\$! + # Wait for server + for i in {1..120}; do + if curl -s http://localhost:8000/v1/models > /dev/null 2>&1; then + echo 'Server ready' + break + fi + sleep 5 + done + + # Run benchmarks + for pair in \"\${ISL_OSL_PAIRS[@]}\"; do + read -r isl osl <<< \"\$pair\" + echo \"Testing ISL=\$isl, OSL=\$osl\" + + vllm bench serve \ + --model \$MODEL \ + --dataset-name random \ + --random-input-len \$isl \ + --random-output-len \$osl \ + --num-prompts 1000 \ + --ignore-eos \ + --trust-remote-code \ + > logs_single_node/results_\${isl}_\${osl}_${SLURM_JOB_ID}.txt 2>&1 + done + + # Cleanup + kill \$SERVER_PID +"