diff --git a/scripts/vllm_dissag/apply_moriio_2pd_patches.sh b/scripts/vllm_dissag/apply_moriio_2pd_patches.sh
new file mode 100755
index 0000000..4458989
--- /dev/null
+++ b/scripts/vllm_dissag/apply_moriio_2pd_patches.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+# apply_moriio_2pd_patches.sh — Apply vLLM PR #39276 at container startup
+# =============================================================================
+# Downloads and applies the patch from vllm-project/vllm PR #39276 which adds:
+#   1. engine_id collision fix (core.py, utils.py)
+#   2. MoRIIOConnector multi-node DP fixes (moriio_connector.py, moriio_common.py)
+#   3. MoRIIO robustness fixes (moriio_engine.py)
+#
+# Idempotent: already-applied patches are skipped via --forward flag.
+# Once PR #39276 is merged upstream, this script becomes a no-op.
+#
+# Exit status: 0 when every verification marker is found, 1 otherwise
+# (vLLM not found, download failed, or a marker is missing).
+# =============================================================================
+
+set -euo pipefail
+
+PR_NUM=39276
+PATCH_URL="https://github.com/vllm-project/vllm/pull/${PR_NUM}.patch"
+# Unpredictable temp name; the EXIT trap removes it on every exit path.
+PATCH_FILE="$(mktemp "/tmp/vllm_pr_${PR_NUM}.XXXXXX")"
+trap 'rm -f -- "${PATCH_FILE}"' EXIT
+
+# Locate the vLLM installation directory. The directory reported by the
+# interpreter is the package that actually gets imported, so it is tried
+# first; the hard-coded dist-packages paths are only a fallback.
+VLLM_INSTALL_DIR=""
+_PYTHON_VLLM_CANDIDATE="$(python3 -c "import vllm, os; print(os.path.dirname(vllm.__file__))" 2>/dev/null || true)"
+for _candidate in \
+    "${_PYTHON_VLLM_CANDIDATE}" \
+    /usr/local/lib/python3.12/dist-packages/vllm \
+    /usr/local/lib/python3.*/dist-packages/vllm; do
+    # An empty candidate (import failed) or an unmatched glob fails the -d test.
+    if [ -d "$_candidate" ]; then
+        VLLM_INSTALL_DIR="$_candidate"
+        break
+    fi
+done
+
+if [ -z "${VLLM_INSTALL_DIR}" ]; then
+    echo "[PR#${PR_NUM}] ERROR: Cannot find vLLM installation directory" >&2
+    exit 1
+fi
+
+# The egg-info / dist-info root is one level up from the vllm package
+VLLM_ROOT="$(dirname "${VLLM_INSTALL_DIR}")"
+echo "[PR#${PR_NUM}] vLLM root: ${VLLM_ROOT}"
+
+# Download the patch
+#   --fail: an HTTP error status fails the command instead of saving the error page
+#   --show-error: keep curl's own diagnostic although progress output is silenced
+#   --retry: ride out transient network errors
+echo "[PR#${PR_NUM}] Downloading patch from ${PATCH_URL}..."
+if ! curl --fail --silent --show-error --location --retry 3 "${PATCH_URL}" -o "${PATCH_FILE}"; then
+    echo "[PR#${PR_NUM}] ERROR: Failed to download patch — check network connectivity" >&2
+    exit 1
+fi
+
+# Verify we got a real patch file (not an HTML error page)
+if ! head -1 "${PATCH_FILE}" | grep -q "^From "; then
+    echo "[PR#${PR_NUM}] ERROR: Downloaded file is not a valid patch" >&2
+    echo "[PR#${PR_NUM}] First line: $(head -1 "${PATCH_FILE}")" >&2
+    exit 1
+fi
+
+PATCH_LINES=$(wc -l < "${PATCH_FILE}")
+echo "[PR#${PR_NUM}] Downloaded patch: ${PATCH_LINES} lines"
+
+# Apply the patch
+#   --forward: skip already-applied hunks (idempotent)
+#   --reject-file=-: don't create .rej files
+#   -p1 strips the first path component (a/vllm/... -> vllm/...)
+echo "[PR#${PR_NUM}] Applying patch to ${VLLM_ROOT}..."
+cd "${VLLM_ROOT}"
+
+# A non-zero status from patch is not fatal here: the marker checks below
+# decide whether the installation is usable.
+_patch_rc=0
+patch -p1 --forward --reject-file=- < "${PATCH_FILE}" 2>&1 || _patch_rc=$?
+if [ "${_patch_rc}" -eq 0 ]; then
+    echo "[PR#${PR_NUM}] Patch applied successfully"
+elif [ "${_patch_rc}" -eq 1 ]; then
+    echo "[PR#${PR_NUM}] Patch already applied or partially applied (some hunks skipped)"
+else
+    echo "[PR#${PR_NUM}] WARNING: Patch application had errors — some fixes may not be active" >&2
+fi
+
+# Verify key files were patched by checking for known fix markers
+echo "[PR#${PR_NUM}] Verifying patches..."
+_ok=0
+_total=0
+
+# _check_patch FILE MARKER DESC
+#   FILE   - path relative to VLLM_INSTALL_DIR
+#   MARKER - literal string expected in FILE once the fix is applied
+#   DESC   - label printed in the report line
+# Increments _total on every call and _ok when MARKER is found.
+# NOTE(review): markers are plain substrings — confirm each one is absent from
+# the unpatched file, otherwise the check passes without the fix.
+_check_patch() {
+    local file="$1"
+    local marker="$2"
+    local desc="$3"
+    _total=$((_total + 1))
+    # -F matches the marker literally rather than as a regular expression.
+    if [ -f "${VLLM_INSTALL_DIR}/${file}" ] && grep -qF -- "${marker}" "${VLLM_INSTALL_DIR}/${file}" 2>/dev/null; then
+        echo " ✓ ${desc}"
+        _ok=$((_ok + 1))
+    else
+        echo " ✗ ${desc} — marker '${marker}' not found in ${file}"
+    fi
+}
+
+_check_patch "v1/engine/core.py" "dp_rank" "engine_id collision fix"
+_check_patch "distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py" "data_parallel_size_local" "multi-node DP sizing"
+_check_patch "distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py" "_req_kv_params" "kv_transfer_params caching"
+_check_patch "distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py" "_is_kv_master" "child node guard"
+_check_patch "distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py" "VLLM_MORIIO_TRANSFER_TIMEOUT_S" "transfer timeout"
+
+echo "[PR#${PR_NUM}] Verification: ${_ok}/${_total} checks passed"
+
+if [ "${_ok}" -ne "${_total}" ]; then
+    echo "[PR#${PR_NUM}] ERROR: Patch verification failed — refusing to continue with partial patches" >&2
+    exit 1
+fi
+
+echo "[PR#${PR_NUM}] Done"
diff --git a/scripts/vllm_dissag/benchmark_xPyD.sh b/scripts/vllm_dissag/benchmark_xPyD.sh
index f1db467..57e0c58 100755
--- a/scripts/vllm_dissag/benchmark_xPyD.sh
+++ b/scripts/vllm_dissag/benchmark_xPyD.sh
@@ -11,20 +11,24 @@ echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" | tee -a ${LOG}_CONCURRE
 echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" | tee -a ${LOG}_CONCURRENCY.log >/dev/null
 sleep 10
 
-echo "Warmup run:" | tee -a ${LOG}_CONCURRENCY.log >/dev/null
+WARMUP_CON="${WARMUP_CON:-1}"
+WARMUP_PROMPTS="${WARMUP_PROMPTS:-16}"
+WARMUP_ISL="${WARMUP_ISL:-32}"
+WARMUP_OSL="${WARMUP_OSL:-32}"
+echo "Warmup run: ${WARMUP_PROMPTS} prompts @ con=${WARMUP_CON} isl=${WARMUP_ISL} osl=${WARMUP_OSL}" | tee -a ${LOG}_CONCURRENCY.log >/dev/null
 vllm bench
serve \ --model $MODEL_PATH \ --backend vllm \ --host 127.0.0.1 \ --port $BENCHMARK_PORT \ --dataset-name "random" \ - --random-input-len 1024 \ - --random-output-len 1024 \ + --random-input-len $WARMUP_ISL \ + --random-output-len $WARMUP_OSL \ --random-prefix-len 0 \ - --num-prompts 16 \ + --num-prompts $WARMUP_PROMPTS \ --request-rate "inf" \ --ignore-eos \ - --max-concurrency 16 \ + --max-concurrency $WARMUP_CON \ 2>&1 | tee -a ${LOG}_CONCURRENCY.log >/dev/null echo "" @@ -41,8 +45,14 @@ for i in $(seq 1 $BENCHMARK_ITR); do if [ "$p_con" -lt 16 ]; then p_con=16 fi - echo "[RUNNING] prompts $p_con isl $isl osl $osl con $con" - vllm bench serve \ + _base_timeout="${STEP_TIMEOUT:-1800}" + _total_tok=$(( isl + osl )) + _scaled_timeout=$(( _base_timeout * _total_tok / 2048 )) + if [ "$_scaled_timeout" -lt "$_base_timeout" ]; then + _scaled_timeout=$_base_timeout + fi + echo "[RUNNING] prompts $p_con isl $isl osl $osl con $con (timeout ${_scaled_timeout}s)" + timeout $_scaled_timeout vllm bench serve \ --model $MODEL_PATH \ --backend vllm \ --host 127.0.0.1 \ @@ -56,8 +66,57 @@ for i in $(seq 1 $BENCHMARK_ITR); do --ignore-eos \ --max-concurrency $con \ 2>&1 | tee -a ${LOG}_CONCURRENCY.log >/dev/null + rc=${PIPESTATUS[0]} + if [ $rc -eq 124 ]; then + echo "[STALL] isl=$isl osl=$osl con=$con timed out after ${_scaled_timeout}s" \ + | tee -a ${LOG}_CONCURRENCY.log ${LOG}_STALLS.log >/dev/null + fi sleep 10 done done done + +if [[ "${RUN_PROFILE:-0}" == "1" ]]; then + PROFILE_PORT="${PROFILE_PORT:-${SERVE_PORT:-${SERVER_PORT:-20005}}}" + DECODE_IP="${DECODE_MASTER_ADDR:-${DECODE_MASTER_IP:-}}" + + echo "==== Starting Profiling Phase ====" | tee -a ${LOG}_PROFILE.log + echo "Profile port: ${PROFILE_PORT} Benchmark port: ${BENCHMARK_PORT}" | tee -a ${LOG}_PROFILE.log + echo "Prefill master: 127.0.0.1:${PROFILE_PORT}" | tee -a ${LOG}_PROFILE.log + echo "Decode master: ${DECODE_IP}:${PROFILE_PORT}" | tee -a ${LOG}_PROFILE.log + + echo "--- start_profile on prefill master 
---" | tee -a ${LOG}_PROFILE.log + curl -s -X POST http://127.0.0.1:${PROFILE_PORT}/start_profile 2>&1 | tee -a ${LOG}_PROFILE.log + echo "" | tee -a ${LOG}_PROFILE.log + + if [[ -n "${DECODE_IP}" ]]; then + echo "--- start_profile on decode master ---" | tee -a ${LOG}_PROFILE.log + curl -s -X POST http://${DECODE_IP}:${PROFILE_PORT}/start_profile 2>&1 | tee -a ${LOG}_PROFILE.log + echo "" | tee -a ${LOG}_PROFILE.log + fi + + PROMPT=$(python3 -c "print('Hello ' * 170)") + echo "--- Sending inference request via proxy port ${BENCHMARK_PORT} ---" | tee -a ${LOG}_PROFILE.log + curl -s -X POST http://127.0.0.1:${BENCHMARK_PORT}/v1/completions \ + -H "Content-Type: application/json" \ + -d "{\"model\":\"${MODEL_PATH}\",\"prompt\":\"${PROMPT}\",\"max_tokens\":1024,\"ignore_eos\":true}" \ + 2>&1 | tee -a ${LOG}_PROFILE.log + echo "" | tee -a ${LOG}_PROFILE.log + + sleep 5 + + echo "--- stop_profile on prefill master ---" | tee -a ${LOG}_PROFILE.log + curl -s -X POST http://127.0.0.1:${PROFILE_PORT}/stop_profile 2>&1 | tee -a ${LOG}_PROFILE.log + echo "" | tee -a ${LOG}_PROFILE.log + + if [[ -n "${DECODE_IP}" ]]; then + echo "--- stop_profile on decode master ---" | tee -a ${LOG}_PROFILE.log + curl -s -X POST http://${DECODE_IP}:${PROFILE_PORT}/stop_profile 2>&1 | tee -a ${LOG}_PROFILE.log + echo "" | tee -a ${LOG}_PROFILE.log + fi + + echo "--- Collecting profile traces ---" | tee -a ${LOG}_PROFILE.log + find /run_logs/${SLURM_JOB_ID}/profiles/ -type f 2>/dev/null | tee -a ${LOG}_PROFILE.log + echo "==== Profiling Phase Complete ====" +fi diff --git a/scripts/vllm_dissag/run_xPyD_models.slurm b/scripts/vllm_dissag/run_xPyD_models.slurm index 87c9713..345f48a 100755 --- a/scripts/vllm_dissag/run_xPyD_models.slurm +++ b/scripts/vllm_dissag/run_xPyD_models.slurm @@ -1,7 +1,5 @@ #!/bin/bash #SBATCH --job-name=vllm-pd # Specify a custom string for your slurm batch job -#SBATCH -N 2 # Request xP + yD nodes (proxy co-located on prefill master) -#SBATCH -n 2 # Request xP + yD 
total tasks #SBATCH --ntasks-per-node=1 #SBATCH --spread-job #SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) @@ -122,10 +120,6 @@ if [[ "$_run_mori" == "1" && "$_run_deepep" == "1" ]]; then fi if [[ "$_run_mori" == "1" ]]; then - if (( xP != 1 || yD != 1 )); then - echo "Error: RUN_MORI=1 requires xP=1 and yD=1 (got xP=$xP, yD=$yD). Multi-node MoRI EP is not yet supported." >&2 - exit 1 - fi if model_allows_mori_ep "$MODEL_NAME"; then RUN_FILE="vllm_disagg_mori_ep.sh" echo "RUN_MORI=1: using $RUN_FILE for model '$MODEL_NAME'" @@ -387,18 +381,26 @@ fuser -k 15000/tcp 2>/dev/null || true; sleep 2; docker pull $DOCKER_IMAGE_NAME 2>/dev/null || true; +# --- Create host-local compilation cache dirs (ext4, survives container restarts) --- +mkdir -p /tmp/vllm_cache/{aiter_jit,triton,vllm,comgr} 2>/dev/null || true; + # --- Build host RDMA library mounts --- -# Mount the host MLNX OFED userspace libraries into the container so that -# libmlx5 / libibverbs always match the host kernel module, preventing -# mlx5dv_devx_alloc_uar failures from ABI mismatches. 
_RDMA_MOUNTS="" _LIBDIR=/usr/lib/x86_64-linux-gnu -for _lib in libmlx5.so libmlx5.so.1 libibverbs.so libibverbs.so.1 librdmacm.so librdmacm.so.1; do + +for _lib in libibverbs.so libibverbs.so.1 librdmacm.so librdmacm.so.1; do [ -e "$_LIBDIR/$_lib" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_LIBDIR/$_lib:$_LIBDIR/$_lib:ro" done -for _vlib in $_LIBDIR/libmlx5.so.1.* $_LIBDIR/libibverbs.so.1.* $_LIBDIR/librdmacm.so.1.*; do +for _vlib in $_LIBDIR/libibverbs.so.1.* $_LIBDIR/librdmacm.so.1.*; do [ -e "$_vlib" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_vlib:$_vlib:ro" done + +for _pattern in libmlx5.so* libionic*.so* libbnxt_re*.so* libefa.so* libhns.so*; do + for _vlib in $_LIBDIR/${_pattern}; do + [ -e "$_vlib" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_vlib:$_vlib:ro" + done +done + [ -d "$_LIBDIR/libibverbs" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_LIBDIR/libibverbs:$_LIBDIR/libibverbs:ro" [ -d /etc/libibverbs.d ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v /etc/libibverbs.d:/etc/libibverbs.d:ro" echo "[host-rdma] mounts: $_RDMA_MOUNTS" @@ -419,8 +421,10 @@ docker run --rm \ -v $HOME/.ssh:/root/.ssh \ --shm-size 64G \ --ulimit nofile=524288:524288 \ + --ulimit memlock=-1:-1 \ -v ${LOG_PATH}:/run_logs \ -v $NIXL_REPO_DIR:$NIXL_COOKBOOK_PATH \ + -v /tmp/vllm_cache:/tmp/vllm_cache \ $_RDMA_MOUNTS \ --entrypoint /bin/bash \ -e SLURM_JOB_ID=$SLURM_JOB_ID \ @@ -438,15 +442,48 @@ docker run --rm \ -e BENCHMARK_ITR=$BENCHMARK_ITR \ -e BENCHMARK_CON="${BENCHMARK_CON}" \ -e BENCHMARK_COMBINATIONS="${BENCHMARK_COMBINATIONS}" \ + -e BENCHMARK_PORT=${BENCHMARK_PORT:-} \ + -e PROXY_TYPE=${PROXY_TYPE:-} \ + -e ROUTER_PORT=${ROUTER_PORT:-} \ -e IPADDRS=$IPADDRS \ - -e PROXY_TYPE=${PROXY_TYPE:-vllm_router} \ - -e ROUTER_PORT=${ROUTER_PORT:-18001} \ - -e BENCHMARK_PORT=${BENCHMARK_PORT:-18001} \ -e PREFILL_DEEPEP_BACKEND=${PREFILL_DEEPEP_BACKEND:-} \ -e DECODE_DEEPEP_BACKEND=${DECODE_DEEPEP_BACKEND:-} \ -e ENABLE_DBO=${ENABLE_DBO:-} \ -e DBO_COMM_SMS=${DBO_COMM_SMS:-} \ -e ENABLE_PROFILING=${ENABLE_PROFILING:-} \ 
+ -e NCCL_IB_HCA=${NCCL_IB_HCA:-} \ + -e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX:-} \ + -e NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-} \ + -e NCCL_CROSS_NIC=${NCCL_CROSS_NIC:-} \ + -e NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-} \ + -e GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-} \ + -e MORI_SOCKET_IFNAME=${MORI_SOCKET_IFNAME:-eth0} \ + -e MORI_IB_GID_INDEX=${MORI_IB_GID_INDEX:-} \ + ${MORI_RDMA_DEVICES:+-e MORI_RDMA_DEVICES=${MORI_RDMA_DEVICES}} \ + ${MORI_NUM_QP_PER_PE:+-e MORI_NUM_QP_PER_PE=${MORI_NUM_QP_PER_PE}} \ + ${VLLM_MORIIO_QP_PER_TRANSFER:+-e VLLM_MORIIO_QP_PER_TRANSFER=${VLLM_MORIIO_QP_PER_TRANSFER}} \ + ${VLLM_MORIIO_NUM_WORKERS:+-e VLLM_MORIIO_NUM_WORKERS=${VLLM_MORIIO_NUM_WORKERS}} \ + -e RUN_PROFILE=${RUN_PROFILE:-0} \ + -e PROFILE_PORT=${PROFILE_PORT:-} \ + -e GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.8} \ + -e GPUS_PER_NODE=${GPUS_PER_NODE:-8} \ + -e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES:-} \ + -e HIP_FORCE_DEV_KERNARG=${HIP_FORCE_DEV_KERNARG:-} \ + -e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM:-} \ + -e VLLM_HANDSHAKE_TIMEOUT_MINS=${VLLM_HANDSHAKE_TIMEOUT_MINS:-} \ + -e VLLM_ENGINE_READY_TIMEOUT_S=${VLLM_ENGINE_READY_TIMEOUT_S:-} \ + -e ROCSHMEM_HEAP_SIZE=${ROCSHMEM_HEAP_SIZE:-} \ + -e ROCSHMEM_MAX_NUM_CONTEXTS=${ROCSHMEM_MAX_NUM_CONTEXTS:-} \ + -e LOG_WAIT_TIMEOUT_SECONDS=${LOG_WAIT_TIMEOUT_SECONDS:-} \ + -e TRITON_CACHE_DIR=${TRITON_CACHE_DIR:-/tmp/vllm_cache/triton} \ + -e VLLM_CACHE_ROOT=${VLLM_CACHE_ROOT:-/tmp/vllm_cache/vllm} \ + -e COMGR_CACHE_DIR=${COMGR_CACHE_DIR:-/tmp/vllm_cache/comgr} \ + -e AITER_JIT_DIR=${AITER_JIT_DIR:-/tmp/vllm_cache/aiter_jit} \ + -e DISTRIBUTED_TIMEOUT_SECONDS=${DISTRIBUTED_TIMEOUT_SECONDS:-7200} \ + -e VLLM_RPC_TIMEOUT=${VLLM_RPC_TIMEOUT:-300000} \ + -e VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=${VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS:-3600} \ + -e VLLM_CUDAGRAPH_MODE=${VLLM_CUDAGRAPH_MODE:-} \ + -e CUDAGRAPH_CAPTURE_SIZES="${CUDAGRAPH_CAPTURE_SIZES:-}" \ --name $DOCKER_CONT_NAME \ $DOCKER_IMAGE_NAME -c " mkdir -p 
/run_logs/${SLURM_JOB_ID} diff --git a/scripts/vllm_dissag/vllm_disagg_mori_ep.sh b/scripts/vllm_dissag/vllm_disagg_mori_ep.sh index db5dd57..45c1900 100755 --- a/scripts/vllm_dissag/vllm_disagg_mori_ep.sh +++ b/scripts/vllm_dissag/vllm_disagg_mori_ep.sh @@ -1,6 +1,9 @@ #!/bin/bash # VLLM Disaggregated Server Launcher - MoRI EP Configuration # ============================================================================= +# Supports multi-node xP/yD topologies with co-located proxy on NODE 0. +# Applies vLLM PR #39276 patches at runtime for multi-node DP support. +# ============================================================================= # ============================================================================= # Environment Configuration @@ -14,10 +17,7 @@ MODEL_PATH=$MODEL_PATH MODEL_NAME="${MODEL_NAME:-}" xP="${xP:-1}" yD="${yD:-1}" -if [ "$xP" -gt 1 ] || [ "$yD" -gt 1 ]; then - echo "Error: xP > 1 or yD > 1 is not supported yet due to MoRI IO connector issues." >&2 - exit 1 -fi +echo "MoRI EP topology: xP=${xP} yD=${yD} (total nodes=$((xP + yD)))" IPADDRS="${IPADDRS:-localhost}" IFS=',' read -ra IP_ARRAY <<< "${IPADDRS}" @@ -28,25 +28,26 @@ ls ${NIXL_COOKBOOK_PATH} # Port Configuration # ============================================================================= -RPC_PORT=13345 -SERVE_PORT=20005 -KV_PORT=9711 -PROXY_PORT=10001 -PROXY_PING_PORT=36367 -LOCAL_PING_PORT=61555 -HANDSHAKE_PORT=8405 -NOTIFY_PORT=61005 +RPC_PORT="${MORI_RPC_PORT:-13345}" +SERVE_PORT="${MORI_SERVE_PORT:-20005}" +KV_PORT="${MORI_KV_PORT:-9711}" +PROXY_PORT="${MORI_PROXY_PORT:-10001}" +PROXY_PING_PORT="${MORI_PROXY_PING_PORT:-36367}" +LOCAL_PING_PORT="${MORI_LOCAL_PING_PORT:-61555}" +HANDSHAKE_PORT="${MORI_HANDSHAKE_PORT:-8405}" +NOTIFY_PORT="${MORI_NOTIFY_PORT:-61005}" # ============================================================================= # Node-Specific Configuration # ============================================================================= 
-PREFILL_DP_SIZE=$((xP * 8)) -DECODE_DP_SIZE=$((yD * 8)) -DP_PARALLEL_SIZE_LOCAL=8 -PREFILL_DP_START_RANK=$(( NODE_RANK * 8 )) +_GPUS_PER_NODE="${GPUS_PER_NODE:-8}" +PREFILL_DP_SIZE=$((xP * _GPUS_PER_NODE)) +DECODE_DP_SIZE=$((yD * _GPUS_PER_NODE)) +DP_PARALLEL_SIZE_LOCAL=${_GPUS_PER_NODE} +PREFILL_DP_START_RANK=$(( NODE_RANK * _GPUS_PER_NODE )) PREFILL_MASTER_ADDR=$(echo "$IPADDRS" | awk -F',' '{print $1}') -DECODE_DP_START_RANK=$(( (NODE_RANK - xP) * 8 )) +DECODE_DP_START_RANK=$(( (NODE_RANK - xP) * _GPUS_PER_NODE )) DECODE_MASTER_ADDR=$(echo "$IPADDRS" | awk -F',' -v pos="$xP" '{print $(pos+1)}') echo "-----------------------------Printing node specific details ----------------------" @@ -69,15 +70,71 @@ host_name=$(hostname) setup_mori_env() { export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_USE_AITER_MOE=1 - export VLLM_LOGGING_LEVEL=INFO - export VLLM_USE_V1=1 export VLLM_ROCM_USE_AITER_MLA=1 + export VLLM_ROCM_USE_AITER_RMSNORM="${VLLM_ROCM_USE_AITER_RMSNORM:-1}" export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 + export VLLM_ROCM_USE_AITER_PAGED_ATTN=0 + export VLLM_USE_AITER_TRITON_SILU_MUL=0 + + export VLLM_LOGGING_LEVEL=INFO + export VLLM_USE_V1=1 export VLLM_ALL2ALL_BACKEND=mori - export GLOO_SOCKET_IFNAME=eth0 - export VLLM_ENGINE_READY_TIMEOUT_S=3600 - export VLLM_RINGBUFFER_WARNING_INTERVAL=3600 - export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=3600 + + # Network — values from cluster profile via Docker env, or defaults + export GLOO_SOCKET_IFNAME="${GLOO_SOCKET_IFNAME:-${MORI_SOCKET_IFNAME:-eth0}}" + export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-${MORI_SOCKET_IFNAME:-eth0}}" + + # Timeouts — generous values so AITER JIT compilation on the first run + # doesn't trip internal watchdogs. 
+ export VLLM_ENGINE_READY_TIMEOUT_S="${VLLM_ENGINE_READY_TIMEOUT_S:-10800}" + export VLLM_RINGBUFFER_WARNING_INTERVAL="${VLLM_RINGBUFFER_WARNING_INTERVAL:-3600}" + export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS="${VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS:-3600}" + export VLLM_RPC_TIMEOUT="${VLLM_RPC_TIMEOUT:-300000}" + + # RDMA / NCCL tuning — cluster-specific via Docker env + export NCCL_IB_HCA="${NCCL_IB_HCA:-mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9}" + export NCCL_IB_GID_INDEX="${NCCL_IB_GID_INDEX:-3}" + export NCCL_NET_GDR_LEVEL="${NCCL_NET_GDR_LEVEL:-3}" + export NCCL_CROSS_NIC="${NCCL_CROSS_NIC:-1}" + export MORI_IB_GID_INDEX="${MORI_IB_GID_INDEX:-3}" + export MORI_RDMA_DEVICES="${MORI_RDMA_DEVICES:-mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9}" + + # MoRI EP QP tuning + export MORI_NUM_QP_PER_PE="${MORI_NUM_QP_PER_PE:-4}" + export VLLM_MORIIO_QP_PER_TRANSFER="${VLLM_MORIIO_QP_PER_TRANSFER:-4}" + export VLLM_MORIIO_NUM_WORKERS="${VLLM_MORIIO_NUM_WORKERS:-4}" + + # MoRIIO robustness timeouts (used by PR #39276 patches) + export VLLM_MORIIO_TRANSFER_TIMEOUT_S="${VLLM_MORIIO_TRANSFER_TIMEOUT_S:-600}" + export VLLM_MORIIO_DEFERRED_TIMEOUT_S="${VLLM_MORIIO_DEFERRED_TIMEOUT_S:-1800}" + export VLLM_HANDSHAKE_TIMEOUT_MINS="${VLLM_HANDSHAKE_TIMEOUT_MINS:-30}" + + # Compilation caches — host-local bind-mount avoids NFS file-lock races + export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-/tmp/vllm_cache/triton}" + export VLLM_CACHE_ROOT="${VLLM_CACHE_ROOT:-/tmp/vllm_cache/vllm}" + export COMGR_CACHE_DIR="${COMGR_CACHE_DIR:-/tmp/vllm_cache/comgr}" + export AITER_JIT_DIR="${AITER_JIT_DIR:-/tmp/vllm_cache/aiter_jit}" + mkdir -p "${TRITON_CACHE_DIR}" "${VLLM_CACHE_ROOT}" "${COMGR_CACHE_DIR}" "${AITER_JIT_DIR}" 2>/dev/null || true + + # Pre-populate AITER tuning CSVs to prevent CSV race condition + if [[ "${VLLM_ROCM_USE_AITER:-1}" == "1" ]]; then + local _aiter_cfgs="/tmp/aiter_configs" + local _aiter_src="/usr/local/lib/python3.12/dist-packages/aiter/configs" + 
if [ -d "${_aiter_src}" ] && [ ! -f "${_aiter_cfgs}/a8w8_blockscale_tuned_gemm.csv" ]; then + mkdir -p "${_aiter_cfgs}" + cp "${_aiter_src}"/*.csv "${_aiter_cfgs}/" 2>/dev/null || true + fi + fi + + # GPU / ROCm tuning + export GPU_MAX_HW_QUEUES="${GPU_MAX_HW_QUEUES:-2}" + export HIP_FORCE_DEV_KERNARG="${HIP_FORCE_DEV_KERNARG:-1}" + export HSA_ENABLE_SDMA="${HSA_ENABLE_SDMA:-0}" + export HSA_NO_SCRATCH_RECLAIM="${HSA_NO_SCRATCH_RECLAIM:-1}" + + # RocSHMEM + export ROCSHMEM_HEAP_SIZE="${ROCSHMEM_HEAP_SIZE:-8589934592}" + export ROCSHMEM_MAX_NUM_CONTEXTS="${ROCSHMEM_MAX_NUM_CONTEXTS:-256}" } build_kv_transfer_config() { @@ -103,14 +160,48 @@ launch_vllm_worker() { setup_mori_env local extra_args=() + local kv_args=() + if [[ "$role" == "master" ]]; then - extra_args+=(--api-server-count=8) + extra_args+=(--api-server-count=${_GPUS_PER_NODE}) + # Fix 6: only master nodes get --kv-transfer-config. + # Child nodes join via --headless and participate in EP all-to-all + # but do not perform KV transfers. + local kv_config + kv_config=$(build_kv_transfer_config "${kv_role}") + kv_args+=(--kv-transfer-config "${kv_config}") else extra_args+=(--data-parallel-start-rank "${dp_start_rank}" --headless) fi - local kv_config - kv_config=$(build_kv_transfer_config "${kv_role}") + # Patch PyTorch's default_pg_timeout so DP Gloo groups use our timeout + # instead of the 30-min default. + local _timeout_s="${DISTRIBUTED_TIMEOUT_SECONDS:-7200}" + local _torch_const="/usr/local/lib/python3.12/dist-packages/torch/distributed/constants.py" + if [ -f "$_torch_const" ]; then + sed -i "s/default_pg_timeout: timedelta = _DEFAULT_PG_TIMEOUT/default_pg_timeout: timedelta = timedelta(seconds=${_timeout_s})/" "$_torch_const" 2>/dev/null || true + fi + + # Execution mode: prefill always uses eager; decode can optionally use + # CUDA graphs via VLLM_CUDAGRAPH_MODE (e.g. FULL_DECODE_ONLY). + # --enforce-eager overrides --compilation-config, so they are mutually exclusive. 
+ local exec_args=() + local _cudagraph_mode="${VLLM_CUDAGRAPH_MODE:-}" + if [[ "$log_prefix" == "decode" && -n "$_cudagraph_mode" && "$_cudagraph_mode" != "NONE" ]]; then + local _capture_sizes="${CUDAGRAPH_CAPTURE_SIZES:-1 2 4 8 16 32 64 128 256}" + exec_args+=(--compilation-config '{"cudagraph_mode":"'"${_cudagraph_mode}"'","custom_ops":["+quant_fp8"]}') + exec_args+=(--cudagraph-capture-sizes ${_capture_sizes}) + else + exec_args+=(--enforce-eager) + fi + + local profiler_args=() + if [[ "${RUN_PROFILE:-0}" == "1" ]]; then + local _profile_dir="/run_logs/${SLURM_JOB_ID}/profiles/${log_prefix}_NODE${NODE_RANK}" + mkdir -p "${_profile_dir}" + profiler_args+=(--profiler-config "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${_profile_dir}\"}") + echo "Profiler enabled for ${log_prefix} ${role} NODE${NODE_RANK} → ${_profile_dir}" + fi vllm serve ${MODEL_PATH} \ -tp 1 \ @@ -120,15 +211,17 @@ launch_vllm_worker() { --data-parallel-rpc-port ${RPC_PORT} \ --enable-expert-parallel \ --port ${SERVE_PORT} \ - --gpu-memory-utilization 0.8 \ + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION:-0.8} \ --kv-cache-dtype fp8 \ --block-size 1 \ --no-enable-prefix-caching \ --all2all-backend mori \ --trust-remote-code \ - --enforce-eager \ + --distributed-timeout-seconds ${DISTRIBUTED_TIMEOUT_SECONDS:-7200} \ + "${profiler_args[@]}" \ + "${exec_args[@]}" \ "${extra_args[@]}" \ - --kv-transfer-config "${kv_config}" \ + "${kv_args[@]}" \ 2>&1 | tee /run_logs/${SLURM_JOB_ID}/${log_prefix}_NODE${NODE_RANK}.log >/dev/null & WORKER_PID=$! 
@@ -165,7 +258,8 @@ print_node_info() { # Container Synchronization # ============================================================================= -for _pid in $(ss -tlnp sport = 2222 2>/dev/null | grep -oP "pid=\K\d+"); do +_BARRIER_PORT="${BARRIER_PORT_MORI:-2222}" +for _pid in $(ss -tlnp sport = ${_BARRIER_PORT} 2>/dev/null | grep -oP "pid=\K\d+"); do kill -9 "$_pid" 2>/dev/null done sleep 2 @@ -173,10 +267,37 @@ sleep 2 echo "Waiting at the container creation barrier on $host_name" python $NIXL_COOKBOOK_PATH/socket_barrier.py \ --local-ip ${host_ip} \ - --local-port 2222 \ + --local-port ${_BARRIER_PORT} \ --enable-port \ --node-ips ${IPADDRS} \ - --node-ports 2222 + --node-ports ${_BARRIER_PORT} + +# ============================================================================= +# Runtime Patches — Apply vLLM PR #39276 for multi-node DP support +# ============================================================================= + +PATCH_SCRIPT="${NIXL_COOKBOOK_PATH:-$(dirname "$0")}/apply_moriio_2pd_patches.sh" +_PATCH_REQUIRED=0 +if [ "$xP" -gt 1 ] || [ "$yD" -gt 1 ]; then + _PATCH_REQUIRED=1 +fi + +if [ -f "${PATCH_SCRIPT}" ]; then + echo "Applying runtime patches (PR #39276)..." + if ! bash "${PATCH_SCRIPT}" 2>&1; then + if [ "$_PATCH_REQUIRED" -eq 1 ]; then + echo "Error: runtime patch failed but multi-node DP requires PR #39276 (xP=${xP}, yD=${yD}). Aborting." + exit 1 + fi + echo "Warning: runtime patch failed — continuing (1P/1D does not strictly require it)" + fi +else + if [ "$_PATCH_REQUIRED" -eq 1 ]; then + echo "Error: ${PATCH_SCRIPT} not found but multi-node DP requires PR #39276 (xP=${xP}, yD=${yD}). Aborting." + exit 1 + fi + echo "Warning: ${PATCH_SCRIPT} not found — skipping runtime patches" +fi # ============================================================================= # Node Role Assignment and Server Launch @@ -199,7 +320,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Waiting for prefill & decode servers to be ready..." 
sleep 20 - TIMEOUT_SECONDS=4000 + TIMEOUT_SECONDS="${LOG_WAIT_TIMEOUT_SECONDS:-4000}" SLEEP_SECONDS=10 SEARCH_SIGNAL="Application startup complete." @@ -242,6 +363,8 @@ if [ "$NODE_RANK" -eq 0 ]; then sleep 20 export BENCHMARK_PORT=${PROXY_PORT} + export DECODE_MASTER_ADDR + export SERVE_PORT bash $NIXL_COOKBOOK_PATH/benchmark_xPyD.sh echo "Killing the proxy server.." diff --git a/scripts/vllm_dissag/vllm_disagg_server_deepep.sh b/scripts/vllm_dissag/vllm_disagg_server_deepep.sh index cd1b8ed..9bc3529 100755 --- a/scripts/vllm_dissag/vllm_disagg_server_deepep.sh +++ b/scripts/vllm_dissag/vllm_disagg_server_deepep.sh @@ -296,6 +296,14 @@ launch_vllm_worker() { local kv_config kv_config=$(build_kv_transfer_config "${kv_role}" "${engine_id}" "${dp_size}") + local profiler_args=() + if [[ "${RUN_PROFILE:-0}" == "1" ]]; then + local _profile_dir="/run_logs/${SLURM_JOB_ID}/profiles/${log_prefix}_NODE${NODE_RANK}" + mkdir -p "${_profile_dir}" + profiler_args+=(--profiler-config "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${_profile_dir}\"}") + echo "Profiler enabled for ${role} NODE${NODE_RANK} → ${_profile_dir}" + fi + vllm serve "${MODEL_PATH}" \ --port "${SERVER_PORT}" \ --trust-remote-code \ @@ -307,11 +315,12 @@ launch_vllm_worker() { --master-addr "${dp_addr}" \ "${compile_args[@]}" \ --no-enable-prefix-caching --block-size 1 \ - --gpu-memory-utilization 0.8 \ + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION:-0.8} \ --kv-cache-dtype fp8 \ --enable-expert-parallel \ --all2all-backend "${backend}" \ ${DBO_ARGS} \ + "${profiler_args[@]}" \ "${extra_args[@]}" \ --kv-transfer-config "${kv_config}" \ 2>&1 | tee /run_logs/${SLURM_JOB_ID}/${log_prefix}_NODE${NODE_RANK}.log >/dev/null & @@ -464,6 +473,8 @@ if [ "$NODE_RANK" -eq 0 ]; then sleep 10 export BENCHMARK_PORT="${PROXY_PORT}" + export DECODE_MASTER_IP + export SERVER_PORT bash "${NIXL_COOKBOOK_PATH}/benchmark_xPyD.sh" echo "Killing proxy server"