Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions scripts/vllm_dissag/apply_moriio_2pd_patches.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/bin/bash
# apply_moriio_2pd_patches.sh — Apply vLLM PR #39276 at container startup
# =============================================================================
# Downloads and applies the patch from vllm-project/vllm PR #39276 which adds:
# 1. engine_id collision fix (core.py, utils.py)
# 2. MoRIIOConnector multi-node DP fixes (moriio_connector.py, moriio_common.py)
# 3. MoRIIO robustness fixes (moriio_engine.py)
#
# Idempotent: already-applied patches are skipped via --forward flag.
# Once PR #39276 is merged upstream, this script becomes a no-op.
# =============================================================================

set -euo pipefail

# Script-wide constants: the PR number drives both the patch URL and the
# local download path.  readonly guards against accidental reassignment.
readonly PR_NUM=39276
readonly PATCH_URL="https://github.com/vllm-project/vllm/pull/${PR_NUM}.patch"
readonly PATCH_FILE="/tmp/vllm_pr_${PR_NUM}.patch"

Comment on lines +15 to +18
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This script downloads and applies the patch from a live GitHub PR URL at runtime. Because PR patch content can change over time (force-push/new commits) and requires outbound network access, startup becomes non-reproducible and can fail in restricted clusters. Consider vendoring the patch into the repo or pinning to a specific commit/tag patch URL (and optionally verifying a checksum) so the applied changes are deterministic.

Copilot uses AI. Check for mistakes.
# --- Locate the vLLM installation directory ---
# Try the well-known dist-packages locations first, then fall back to the
# path reported by python3 itself (covers venvs / non-standard installs).
VLLM_INSTALL_DIR=""
_PYTHON_VLLM_CANDIDATE="$(python3 -c "import vllm, os; print(os.path.dirname(vllm.__file__))" 2>/dev/null || true)"
for _candidate in \
  /usr/local/lib/python3.12/dist-packages/vllm \
  /usr/local/lib/python3.*/dist-packages/vllm; do
  if [ -d "$_candidate" ]; then
    VLLM_INSTALL_DIR="$_candidate"
    break
  fi
done

if [ -z "${VLLM_INSTALL_DIR}" ] && [ -n "${_PYTHON_VLLM_CANDIDATE}" ] && [ -d "${_PYTHON_VLLM_CANDIDATE}" ]; then
  VLLM_INSTALL_DIR="${_PYTHON_VLLM_CANDIDATE}"
fi

if [ -z "${VLLM_INSTALL_DIR}" ]; then
  # Diagnostics go to stderr so they are not mistaken for normal output.
  echo "[PR#${PR_NUM}] ERROR: Cannot find vLLM installation directory" >&2
  exit 1
fi

# The egg-info / dist-info root is one level up from the vllm package
VLLM_ROOT="$(dirname "${VLLM_INSTALL_DIR}")"
echo "[PR#${PR_NUM}] vLLM root: ${VLLM_ROOT}"

# --- Download the patch ---
echo "[PR#${PR_NUM}] Downloading patch from ${PATCH_URL}..."
# -f (--fail): make curl exit non-zero on HTTP errors (e.g. 404) instead of
# silently saving the error page.  -S keeps curl's own error message visible
# on stderr; the old 2>/dev/null suppressed the only clue about *why* a
# download failed.
if ! curl -fsSL "${PATCH_URL}" -o "${PATCH_FILE}"; then
  echo "[PR#${PR_NUM}] ERROR: Failed to download patch — check network connectivity" >&2
  exit 1
fi

# Verify we got a real patch file, not an HTML error page: git format-patch
# output always starts with a "From <sha>" line.
if ! head -1 "${PATCH_FILE}" | grep -q "^From "; then
  echo "[PR#${PR_NUM}] ERROR: Downloaded file is not a valid patch" >&2
  echo "[PR#${PR_NUM}] First line: $(head -1 "${PATCH_FILE}")" >&2
  rm -f "${PATCH_FILE}"
  exit 1
fi

PATCH_LINES=$(wc -l < "${PATCH_FILE}")
echo "[PR#${PR_NUM}] Downloaded patch: ${PATCH_LINES} lines"

# --- Apply the patch ---
#   --forward       : skip already-applied hunks (idempotent)
#   --reject-file=- : don't create .rej files
#   -p1             : strip the first path component (a/vllm/... -> vllm/...)
echo "[PR#${PR_NUM}] Applying patch to ${VLLM_ROOT}..."
cd "${VLLM_ROOT}"

# Capture the exit status explicitly.  GNU patch returns 0 on clean apply,
# 1 when some hunks were skipped (already applied), >1 on real errors.
# The previous `elif [ $? -eq 1 ]` relied on $? surviving untouched from the
# failed `if` condition, which is fragile; `|| rc=$?` also keeps `set -e`
# from aborting the script on a non-zero patch status.
_patch_rc=0
patch -p1 --forward --reject-file=- < "${PATCH_FILE}" 2>&1 || _patch_rc=$?
if [ "${_patch_rc}" -eq 0 ]; then
  echo "[PR#${PR_NUM}] Patch applied successfully"
elif [ "${_patch_rc}" -eq 1 ]; then
  echo "[PR#${PR_NUM}] Patch already applied or partially applied (some hunks skipped)"
else
  echo "[PR#${PR_NUM}] WARNING: Patch application had errors — some fixes may not be active" >&2
fi

# --- Verify key files were patched by checking for known fix markers ---
echo "[PR#${PR_NUM}] Verifying patches..."
_ok=0
_total=0

#######################################
# Check that a patched file contains the marker string its fix introduces.
# Globals:   VLLM_INSTALL_DIR (read), _ok / _total (incremented)
# Arguments: $1 - file path relative to the vllm package directory
#            $2 - literal marker string expected after patching
#            $3 - human-readable description for the report line
# Outputs:   one "✓"/"✗" report line on stdout
#######################################
_check_patch() {
  local file="$1"
  local marker="$2"
  local desc="$3"
  _total=$((_total + 1))
  # -F -- : markers are literal strings, not regexes, and must not be
  # interpreted as grep options even if one ever starts with '-'.
  if [ -f "${VLLM_INSTALL_DIR}/${file}" ] && grep -qF -- "${marker}" "${VLLM_INSTALL_DIR}/${file}" 2>/dev/null; then
    echo "  ✓ ${desc}"
    _ok=$((_ok + 1))
  else
    echo "  ✗ ${desc} — marker '${marker}' not found in ${file}"
  fi
}

_check_patch "v1/engine/core.py" "dp_rank" "engine_id collision fix"
_check_patch "distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py" "data_parallel_size_local" "multi-node DP sizing"
_check_patch "distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py" "_req_kv_params" "kv_transfer_params caching"
_check_patch "distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py" "_is_kv_master" "child node guard"
_check_patch "distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py" "VLLM_MORIIO_TRANSFER_TIMEOUT_S" "transfer timeout"

echo "[PR#${PR_NUM}] Verification: ${_ok}/${_total} checks passed"

# Remove the downloaded patch regardless of the verification outcome.
rm -f "${PATCH_FILE}"

# Fail hard on partial patches: running with only some fixes active is worse
# than failing fast at startup.
if [ "${_ok}" -ne "${_total}" ]; then
  echo "[PR#${PR_NUM}] ERROR: Patch verification failed — refusing to continue with partial patches" >&2
  exit 1
fi

echo "[PR#${PR_NUM}] Done"
73 changes: 66 additions & 7 deletions scripts/vllm_dissag/benchmark_xPyD.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,24 @@ echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" | tee -a ${LOG}_CONCURRE
echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" | tee -a ${LOG}_CONCURRENCY.log >/dev/null

sleep 10
# Warmup parameters are overridable from the environment; the defaults keep
# the warmup short — it only needs to prime caches / JIT-compile kernels.
WARMUP_CON="${WARMUP_CON:-1}"
WARMUP_PROMPTS="${WARMUP_PROMPTS:-16}"
WARMUP_ISL="${WARMUP_ISL:-32}"
WARMUP_OSL="${WARMUP_OSL:-32}"
# Quote every expansion (paths and numeric args) so values containing spaces
# or globs cannot split the command line.
echo "Warmup run: ${WARMUP_PROMPTS} prompts @ con=${WARMUP_CON} isl=${WARMUP_ISL} osl=${WARMUP_OSL}" | tee -a "${LOG}_CONCURRENCY.log" >/dev/null
vllm bench serve \
  --model "$MODEL_PATH" \
  --backend vllm \
  --host 127.0.0.1 \
  --port "$BENCHMARK_PORT" \
  --dataset-name "random" \
  --random-input-len "$WARMUP_ISL" \
  --random-output-len "$WARMUP_OSL" \
  --random-prefix-len 0 \
  --num-prompts "$WARMUP_PROMPTS" \
  --request-rate "inf" \
  --ignore-eos \
  --max-concurrency "$WARMUP_CON" \
  2>&1 | tee -a "${LOG}_CONCURRENCY.log" >/dev/null
echo ""

Expand All @@ -41,8 +45,14 @@ for i in $(seq 1 $BENCHMARK_ITR); do
if [ "$p_con" -lt 16 ]; then
p_con=16
fi
echo "[RUNNING] prompts $p_con isl $isl osl $osl con $con"
vllm bench serve \
_base_timeout="${STEP_TIMEOUT:-1800}"
_total_tok=$(( isl + osl ))
_scaled_timeout=$(( _base_timeout * _total_tok / 2048 ))
if [ "$_scaled_timeout" -lt "$_base_timeout" ]; then
_scaled_timeout=$_base_timeout
fi
echo "[RUNNING] prompts $p_con isl $isl osl $osl con $con (timeout ${_scaled_timeout}s)"
timeout $_scaled_timeout vllm bench serve \
--model $MODEL_PATH \
--backend vllm \
--host 127.0.0.1 \
Expand All @@ -56,8 +66,57 @@ for i in $(seq 1 $BENCHMARK_ITR); do
--ignore-eos \
--max-concurrency $con \
2>&1 | tee -a ${LOG}_CONCURRENCY.log >/dev/null
rc=${PIPESTATUS[0]}
if [ $rc -eq 124 ]; then
echo "[STALL] isl=$isl osl=$osl con=$con timed out after ${_scaled_timeout}s" \
| tee -a ${LOG}_CONCURRENCY.log ${LOG}_STALLS.log >/dev/null
fi

sleep 10
done
done
done

if [[ "${RUN_PROFILE:-0}" == "1" ]]; then
  # Resolve the engine admin port (start/stop_profile endpoints) and the
  # decode-master IP from whichever variables the launcher exported.
  PROFILE_PORT="${PROFILE_PORT:-${SERVE_PORT:-${SERVER_PORT:-20005}}}"
  DECODE_IP="${DECODE_MASTER_ADDR:-${DECODE_MASTER_IP:-}}"

  echo "==== Starting Profiling Phase ====" | tee -a "${LOG}_PROFILE.log"
  echo "Profile port: ${PROFILE_PORT} Benchmark port: ${BENCHMARK_PORT}" | tee -a "${LOG}_PROFILE.log"
  echo "Prefill master: 127.0.0.1:${PROFILE_PORT}" | tee -a "${LOG}_PROFILE.log"
  echo "Decode master: ${DECODE_IP}:${PROFILE_PORT}" | tee -a "${LOG}_PROFILE.log"

  echo "--- start_profile on prefill master ---" | tee -a "${LOG}_PROFILE.log"
  curl -s -X POST "http://127.0.0.1:${PROFILE_PORT}/start_profile" 2>&1 | tee -a "${LOG}_PROFILE.log"
  echo "" | tee -a "${LOG}_PROFILE.log"

  # Only hit the decode master when multi-node info is available.
  if [[ -n "${DECODE_IP}" ]]; then
    echo "--- start_profile on decode master ---" | tee -a "${LOG}_PROFILE.log"
    curl -s -X POST "http://${DECODE_IP}:${PROFILE_PORT}/start_profile" 2>&1 | tee -a "${LOG}_PROFILE.log"
    echo "" | tee -a "${LOG}_PROFILE.log"
  fi

  # One representative request through the proxy so the trace captures a full
  # prefill -> decode round trip.  The prompt is a fixed ~170-word string.
  PROMPT=$(python3 -c "print('Hello ' * 170)")
  echo "--- Sending inference request via proxy port ${BENCHMARK_PORT} ---" | tee -a "${LOG}_PROFILE.log"
  curl -s -X POST "http://127.0.0.1:${BENCHMARK_PORT}/v1/completions" \
    -H "Content-Type: application/json" \
    -d "{\"model\":\"${MODEL_PATH}\",\"prompt\":\"${PROMPT}\",\"max_tokens\":1024,\"ignore_eos\":true}" \
    2>&1 | tee -a "${LOG}_PROFILE.log"
  echo "" | tee -a "${LOG}_PROFILE.log"

  sleep 5

  echo "--- stop_profile on prefill master ---" | tee -a "${LOG}_PROFILE.log"
  curl -s -X POST "http://127.0.0.1:${PROFILE_PORT}/stop_profile" 2>&1 | tee -a "${LOG}_PROFILE.log"
  echo "" | tee -a "${LOG}_PROFILE.log"

  if [[ -n "${DECODE_IP}" ]]; then
    echo "--- stop_profile on decode master ---" | tee -a "${LOG}_PROFILE.log"
    curl -s -X POST "http://${DECODE_IP}:${PROFILE_PORT}/stop_profile" 2>&1 | tee -a "${LOG}_PROFILE.log"
    echo "" | tee -a "${LOG}_PROFILE.log"
  fi

  echo "--- Collecting profile traces ---" | tee -a "${LOG}_PROFILE.log"
  find "/run_logs/${SLURM_JOB_ID}/profiles/" -type f 2>/dev/null | tee -a "${LOG}_PROFILE.log"
  echo "==== Profiling Phase Complete ===="
fi
65 changes: 51 additions & 14 deletions scripts/vllm_dissag/run_xPyD_models.slurm
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#!/bin/bash
#SBATCH --job-name=vllm-pd # Specify a custom string for your slurm batch job
#SBATCH -N 2 # Request xP + yD nodes (proxy co-located on prefill master)
#SBATCH -n 2 # Request xP + yD total tasks
#SBATCH --ntasks-per-node=1
#SBATCH --spread-job
#SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed)
Expand Down Expand Up @@ -122,10 +120,6 @@ if [[ "$_run_mori" == "1" && "$_run_deepep" == "1" ]]; then
fi

if [[ "$_run_mori" == "1" ]]; then
if (( xP != 1 || yD != 1 )); then
echo "Error: RUN_MORI=1 requires xP=1 and yD=1 (got xP=$xP, yD=$yD). Multi-node MoRI EP is not yet supported." >&2
exit 1
fi
if model_allows_mori_ep "$MODEL_NAME"; then
RUN_FILE="vllm_disagg_mori_ep.sh"
echo "RUN_MORI=1: using $RUN_FILE for model '$MODEL_NAME'"
Expand Down Expand Up @@ -387,18 +381,26 @@ fuser -k 15000/tcp 2>/dev/null || true;
sleep 2;
docker pull $DOCKER_IMAGE_NAME 2>/dev/null || true;

# --- Create host-local compilation cache dirs (ext4, survives container restarts) ---
mkdir -p /tmp/vllm_cache/{aiter_jit,triton,vllm,comgr} 2>/dev/null || true;

# --- Build host RDMA library mounts ---
# Mount the host MLNX OFED userspace libraries into the container so that
# libmlx5 / libibverbs always match the host kernel module, preventing
# mlx5dv_devx_alloc_uar failures from ABI mismatches.
_RDMA_MOUNTS=""
_LIBDIR=/usr/lib/x86_64-linux-gnu
for _lib in libmlx5.so libmlx5.so.1 libibverbs.so libibverbs.so.1 librdmacm.so librdmacm.so.1; do

for _lib in libibverbs.so libibverbs.so.1 librdmacm.so librdmacm.so.1; do
[ -e "$_LIBDIR/$_lib" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_LIBDIR/$_lib:$_LIBDIR/$_lib:ro"
done
for _vlib in $_LIBDIR/libmlx5.so.1.* $_LIBDIR/libibverbs.so.1.* $_LIBDIR/librdmacm.so.1.*; do
for _vlib in $_LIBDIR/libibverbs.so.1.* $_LIBDIR/librdmacm.so.1.*; do
[ -e "$_vlib" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_vlib:$_vlib:ro"
done

for _pattern in libmlx5.so* libionic*.so* libbnxt_re*.so* libefa.so* libhns.so*; do
for _vlib in $_LIBDIR/${_pattern}; do
[ -e "$_vlib" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_vlib:$_vlib:ro"
done
done

[ -d "$_LIBDIR/libibverbs" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_LIBDIR/libibverbs:$_LIBDIR/libibverbs:ro"
[ -d /etc/libibverbs.d ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v /etc/libibverbs.d:/etc/libibverbs.d:ro"
echo "[host-rdma] mounts: $_RDMA_MOUNTS"
Expand All @@ -419,8 +421,10 @@ docker run --rm \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--ulimit nofile=524288:524288 \
--ulimit memlock=-1:-1 \
-v ${LOG_PATH}:/run_logs \
-v $NIXL_REPO_DIR:$NIXL_COOKBOOK_PATH \
-v /tmp/vllm_cache:/tmp/vllm_cache \
$_RDMA_MOUNTS \
--entrypoint /bin/bash \
-e SLURM_JOB_ID=$SLURM_JOB_ID \
Expand All @@ -438,15 +442,48 @@ docker run --rm \
-e BENCHMARK_ITR=$BENCHMARK_ITR \
-e BENCHMARK_CON="${BENCHMARK_CON}" \
-e BENCHMARK_COMBINATIONS="${BENCHMARK_COMBINATIONS}" \
-e BENCHMARK_PORT=${BENCHMARK_PORT:-} \
-e PROXY_TYPE=${PROXY_TYPE:-} \
-e ROUTER_PORT=${ROUTER_PORT:-} \
-e IPADDRS=$IPADDRS \
-e PROXY_TYPE=${PROXY_TYPE:-vllm_router} \
-e ROUTER_PORT=${ROUTER_PORT:-18001} \
-e BENCHMARK_PORT=${BENCHMARK_PORT:-18001} \
-e PREFILL_DEEPEP_BACKEND=${PREFILL_DEEPEP_BACKEND:-} \
-e DECODE_DEEPEP_BACKEND=${DECODE_DEEPEP_BACKEND:-} \
-e ENABLE_DBO=${ENABLE_DBO:-} \
-e DBO_COMM_SMS=${DBO_COMM_SMS:-} \
-e ENABLE_PROFILING=${ENABLE_PROFILING:-} \
-e NCCL_IB_HCA=${NCCL_IB_HCA:-} \
-e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX:-} \
-e NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-} \
-e NCCL_CROSS_NIC=${NCCL_CROSS_NIC:-} \
-e NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-} \
-e GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-} \
-e MORI_SOCKET_IFNAME=${MORI_SOCKET_IFNAME:-eth0} \
-e MORI_IB_GID_INDEX=${MORI_IB_GID_INDEX:-} \
${MORI_RDMA_DEVICES:+-e MORI_RDMA_DEVICES=${MORI_RDMA_DEVICES}} \
${MORI_NUM_QP_PER_PE:+-e MORI_NUM_QP_PER_PE=${MORI_NUM_QP_PER_PE}} \
${VLLM_MORIIO_QP_PER_TRANSFER:+-e VLLM_MORIIO_QP_PER_TRANSFER=${VLLM_MORIIO_QP_PER_TRANSFER}} \
${VLLM_MORIIO_NUM_WORKERS:+-e VLLM_MORIIO_NUM_WORKERS=${VLLM_MORIIO_NUM_WORKERS}} \
-e RUN_PROFILE=${RUN_PROFILE:-0} \
-e PROFILE_PORT=${PROFILE_PORT:-} \
-e GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.8} \
-e GPUS_PER_NODE=${GPUS_PER_NODE:-8} \
-e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES:-} \
-e HIP_FORCE_DEV_KERNARG=${HIP_FORCE_DEV_KERNARG:-} \
-e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM:-} \
-e VLLM_HANDSHAKE_TIMEOUT_MINS=${VLLM_HANDSHAKE_TIMEOUT_MINS:-} \
-e VLLM_ENGINE_READY_TIMEOUT_S=${VLLM_ENGINE_READY_TIMEOUT_S:-} \
-e ROCSHMEM_HEAP_SIZE=${ROCSHMEM_HEAP_SIZE:-} \
-e ROCSHMEM_MAX_NUM_CONTEXTS=${ROCSHMEM_MAX_NUM_CONTEXTS:-} \
-e LOG_WAIT_TIMEOUT_SECONDS=${LOG_WAIT_TIMEOUT_SECONDS:-} \
-e TRITON_CACHE_DIR=${TRITON_CACHE_DIR:-/tmp/vllm_cache/triton} \
-e VLLM_CACHE_ROOT=${VLLM_CACHE_ROOT:-/tmp/vllm_cache/vllm} \
-e COMGR_CACHE_DIR=${COMGR_CACHE_DIR:-/tmp/vllm_cache/comgr} \
-e AITER_JIT_DIR=${AITER_JIT_DIR:-/tmp/vllm_cache/aiter_jit} \
-e DISTRIBUTED_TIMEOUT_SECONDS=${DISTRIBUTED_TIMEOUT_SECONDS:-7200} \
-e VLLM_RPC_TIMEOUT=${VLLM_RPC_TIMEOUT:-300000} \
-e VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=${VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS:-3600} \
-e VLLM_CUDAGRAPH_MODE=${VLLM_CUDAGRAPH_MODE:-} \
-e CUDAGRAPH_CAPTURE_SIZES="${CUDAGRAPH_CAPTURE_SIZES:-}" \
--name $DOCKER_CONT_NAME \
$DOCKER_IMAGE_NAME -c "
mkdir -p /run_logs/${SLURM_JOB_ID}
Expand Down
Loading