Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
447 changes: 447 additions & 0 deletions contrib/models/MiMo-V2.5/README.md

Large diffs are not rendered by default.

113 changes: 113 additions & 0 deletions contrib/models/MiMo-V2.5/perf_test/0_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/bin/bash
# Setup for MiMo-V2.5 vLLM benchmarking on Trn2.
#
# Clones upstream vllm-project/vllm-neuron at release-0.5.0 and applies
# vllm-neuron-patch.patch, which adds a runtime registration hook so the
# contrib NeuronMiMoV2ForCausalLM is plugged into both NxDI's MODEL_TYPES
# (under the key "mimov2") and vLLM's ModelRegistry (as
# MiMoV2ForCausalLM) at vllm-neuron plugin init time.
#
# Then downloads XiaomiMiMo/MiMo-V2.5 from HuggingFace (FP8 blockwise, ~320 GB).
#
# Environment:
#   MIMO_V2_5_PATH  optional; where to place the HF download
#                   (default /opt/dlami/nvme/models/MiMo-V2.5)
set -euo pipefail  # fail fast on errors, unset vars, and mid-pipeline failures

echo "=========================================="
echo "Setup: vllm-neuron + MiMo-V2.5 weights"
echo "=========================================="

source /opt/aws_neuronx_venv_pytorch_inference_vllm_0_16/bin/activate

# Resolve repo-relative paths up front — we cd into $HOME/vllm-neuron below,
# after which $0's relative form would no longer resolve.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PATCH_FILE="$SCRIPT_DIR/vllm-neuron-patch.patch"
CONTRIB_SRC="$(cd "$SCRIPT_DIR/.." && pwd)/src"

echo ""
echo "[1/2] Installing vllm-neuron (release-0.5.0) with the contrib registration patch..."

if [ ! -d "$HOME/vllm-neuron" ]; then
  git clone --branch release-0.5.0 https://github.com/vllm-project/vllm-neuron.git "$HOME/vllm-neuron"
fi

cd "$HOME/vllm-neuron"

# Apply patch (idempotent via `git apply --check` first).
if git apply --check "$PATCH_FILE" 2>/dev/null; then
  git apply "$PATCH_FILE"
  echo " Applied $PATCH_FILE"
else
  echo " Patch already applied or conflicts; continuing."
fi

pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com -e .

python3 -c "import vllm_neuron; print('vllm-neuron installed:', vllm_neuron.__file__)"

echo ""
echo "[2/2] Downloading MiMo-V2.5 FP8 weights from HuggingFace..."

MIMO_PATH="${MIMO_V2_5_PATH:-/opt/dlami/nvme/models/MiMo-V2.5}"
# `compgen -G` tests for at least one *.safetensors match without spawning
# an `ls | wc -l` pipeline (which would both mis-handle odd filenames and
# trip pipefail when the directory has no matches).
if [ -d "$MIMO_PATH" ] && compgen -G "$MIMO_PATH/*.safetensors" > /dev/null; then
  echo " MiMo-V2.5 weights already exist at $MIMO_PATH, skipping download"
else
  mkdir -p "$MIMO_PATH"
  huggingface-cli download XiaomiMiMo/MiMo-V2.5 --local-dir "$MIMO_PATH" --max-workers 16
  echo " Download complete: $(du -sh "$MIMO_PATH" | cut -f1)"
fi

NEURON_FP8_PATH="${MIMO_PATH}-Neuron-FP8"
COMPILED_PATH="/opt/dlami/nvme/compiled/mimo_v2_5_bs32_moetp1_ep64_fp8_vllm"

# Everything below is informational: print the follow-up commands and env
# vars the user should run/export next (nothing else is executed here).
echo ""
echo "========================================================================"
echo "Next steps"
echo "========================================================================"
echo ""
echo "1. Preprocess the FP8 checkpoint for Neuron (~16 min, ~15 GB peak RAM):"
echo ""
echo " python $CONTRIB_SRC/conversion_script/preprocess_mimo_v2_5_fp8.py \\"
echo " --hf_model_path $MIMO_PATH \\"
echo " --save_path $NEURON_FP8_PATH \\"
echo " --tp_degree 64"
echo ""
echo "2. Export the environment variables used by the smoke / bench scripts:"
echo ""
echo " # --- Required ---"
echo " # Contrib package src (registers NeuronMiMoV2ForCausalLM with vllm-neuron)."
echo " export NXDI_CONTRIB_MIMO_V2_5_SRC=$CONTRIB_SRC"
echo " # vLLM's builtin arch validator only knows MiMoV2FlashForCausalLM, so the"
echo " # preprocess rewrites the checkpoint's config.json architectures to that"
echo " # name. Alias V2.5 src to the Flash env var so vllm-neuron's contrib hook"
echo " # registers mimov2flash -> our V2.5 NeuronMiMoV2ForCausalLM class."
echo " export NXDI_CONTRIB_MIMO_V2_FLASH_SRC=\"\$NXDI_CONTRIB_MIMO_V2_5_SRC\""
echo " # Preprocessed Neuron-FP8 checkpoint."
echo " export MIMO_V2_5_PATH=$NEURON_FP8_PATH"
echo ""
echo " # --- Optional (recommended) ---"
echo " # vLLM compiles into <checkpoint>/neuron-compiled-artifacts/<hash>/ by"
echo " # default. Pin it to a persistent shared location so multiple configs"
echo " # don't collide and you can reuse the NEFF / sharded weights across runs."
echo " export NEURON_COMPILED_ARTIFACTS=$COMPILED_PATH"
echo " # NxDI's HLO/NEFF staging workdir (.hlo_module.pb etc). Default is"
echo " # /tmp/nxd_model/<compile-name>/; on Trn2 /tmp is wiped by the nightly"
echo " # reboot, and parallel compiles sharing the same basename silently"
echo " # overwrite each other's staged HLOs. Pin to a unique per-config"
echo " # directory under persistent storage."
echo " export BASE_COMPILE_WORK_DIR=/opt/dlami/nvme/tmp/nxd_model/\$(basename $COMPILED_PATH)"
echo " # First-time compile of V2.5's 256-expert MoE takes ~30 min (NEFF HLO +"
echo " # shard_checkpoint for 64 ranks). Extend vLLM's ready timeout."
echo " export VLLM_ENGINE_READY_TIMEOUT_S=7200"
echo ""
echo "3a. Run the one-shot benchmark (launches + benches + tears down):"
echo ""
echo " bash $SCRIPT_DIR/bench_mimo_v2_5.sh"
echo ""
echo "3b. ...OR keep a server up and probe it manually:"
echo ""
echo " # shell 1: server in foreground (Ctrl-C to stop)"
echo " bash $SCRIPT_DIR/start_vllm_server.sh"
echo ""
echo " # shell 2: once 'Application startup complete.' prints,"
echo " bash $SCRIPT_DIR/sanity_check.sh"
echo " CONCURRENCY=16 NUM_PROMPTS=128 bash $SCRIPT_DIR/run_bench_single.sh"
echo ""
87 changes: 87 additions & 0 deletions contrib/models/MiMo-V2.5/perf_test/bench_mimo_v2_5.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/bash
# MiMo-V2.5 FP8 vLLM benchmark on Trn2. One-shot wrapper:
# launch server -> sanity check -> bench at c=1,16,32 -> stop server.
#
# This script composes three building blocks in perf_test/:
# start_vllm_server.sh - server launch + env-var setup (backgrounded here)
# sanity_check.sh - one-shot curl against the running server
# run_bench_single.sh - one concurrency level of `vllm bench serve`
#
# Use those directly if you want to keep a long-running server and iterate
# on bench parameters from another shell.
#
# Server recipe: TP=64, moe_tp=1/moe_ep=64, BS=32, continuous batching.
# BS=32 is the smallest working batch size on the FP8 path (NxDI's TKG
# path refuses Expert Parallelism with BS < num_experts/top_k = 256/8 = 32).
# BS=1 single-stream latency demos are not currently supported on V2.5 FP8.
set -euo pipefail  # -u/-o pipefail: catch unset vars and mid-pipeline failures too

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PORT="${PORT:-8000}"
RESULTS_DIR="${RESULTS_DIR:-/opt/dlami/nvme/logs/bench_results/mimo_v2_5}"
# Encodes the server recipe above; used in the per-run output filenames.
CONFIG_NAME="bs32_tp64_moetp1_ep64"

mkdir -p "$RESULTS_DIR"
# poll for up to 2 h.
# Wait for the vLLM server to answer /health. First-time compile of the
# 256-expert MoE model takes ~30 min and can stretch past 2 h under
# contention, so poll for up to 2 h.
#
# Globals:   PORT (read)
# Returns:   0 once the server is healthy, 1 if the 2 h budget expires.
wait_for_server() {
  local interval=10
  local max_attempts=720          # 720 attempts * 10 s = 7200 s = 2 h
  local start=$SECONDS
  local i
  echo " Waiting for vLLM server on port $PORT (up to 2 h for first compile)..."
  for (( i = 1; i <= max_attempts; i++ )); do
    if curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
      echo " Server ready after $((SECONDS - start))s."
      return 0
    fi
    # Progress note once a minute so a long first compile doesn't look hung.
    if (( i % 6 == 0 )); then
      echo " ...still waiting ($((SECONDS - start))s elapsed)"
    fi
    sleep "$interval"
  done
  echo " ERROR: Server did not start within $((max_attempts * interval))s"
  return 1
}

# Tear down the vLLM server. Prefers the PID captured at launch (SERVER_PID,
# set by the main flow) so we only signal our own server, then sweeps by
# process name as a fallback since start_vllm_server.sh execs python3 itself.
# Runs as the EXIT trap, so nothing here may abort shutdown — hence `|| true`.
stop_server() {
  echo " Stopping vLLM server..."
  if [ -n "${SERVER_PID:-}" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
    kill "$SERVER_PID" 2>/dev/null || true
  fi
  pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true
  sleep 5
}

printf '==========================================\n'
printf 'MiMo-V2.5 FP8 Performance Benchmark\n'
printf '==========================================\n'
printf 'Port: %s\n' "$PORT"
printf 'Results: %s\n' "$RESULTS_DIR"
printf '\n'

# Launch the server in the background. start_vllm_server.sh owns all the
# env vars (MODEL_PATH, NEURON_COMPILED_ARTIFACTS, BASE_COMPILE_WORK_DIR,
# contrib src registration, etc.) and execs `python3 -m vllm...`.
bash "$SCRIPT_DIR/start_vllm_server.sh" &
SERVER_PID=$!
trap stop_server EXIT

wait_for_server

# One-shot sanity check (curl the chat endpoint); best-effort only.
PORT="$PORT" bash "$SCRIPT_DIR/sanity_check.sh" || true

# Sweep three concurrency levels. run_bench_single.sh reads its knobs from
# the environment; see its header for all the options.
for pair in "1:16" "16:128" "32:128"; do
  PORT="$PORT" RESULTS_DIR="$RESULTS_DIR" CONFIG_NAME="$CONFIG_NAME" \
    CONCURRENCY="${pair%%:*}" NUM_PROMPTS="${pair##*:}" \
    bash "$SCRIPT_DIR/run_bench_single.sh"
done

printf '==========================================\n'
printf 'MiMo-V2.5 FP8 benchmark complete!\n'
printf 'Results saved to: %s\n' "$RESULTS_DIR"
printf '==========================================\n'
ls -la "$RESULTS_DIR"
76 changes: 76 additions & 0 deletions contrib/models/MiMo-V2.5/perf_test/run_bench_single.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash
# Run a single vllm-bench-serve pass against an already-running vLLM server.
#
# Unlike bench_mimo_v2_5.sh this script does NOT launch or kill the vLLM
# server — you bring your own. That makes it convenient when the bench driver
# in bench_mimo_v2_5.sh times out during first-time compilation: the server
# keeps running, and once it's ready you can collect numbers with this.
#
# Usage:
# bash run_bench_single.sh # defaults: c=1, 16 prompts
# CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
# CONFIG_NAME=bs32_tp1_ep64_opt CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
#
# Environment knobs:
# PORT vLLM server port (default 8000)
# MIMO_V2_5_PATH Path to the Neuron-FP8 checkpoint (default
# /opt/dlami/nvme/models/MiMo-V2.5-Neuron-FP8)
# CONCURRENCY --max-concurrency (default 1)
# NUM_PROMPTS --num-prompts (default 16)
# INPUT_LEN --random-input-len (default 900)
# OUTPUT_LEN --random-output-len (default 90)
# RANGE_RATIO --random-range-ratio (default 0.03)
# CONFIG_NAME Used in the output filename (default bs1_tp64_ep1)
# RESULTS_DIR Where to dump per-run log (default /opt/dlami/nvme/logs/bench_results/mimo_v2_5)

# pipefail matters here: without it a failing `vllm bench serve` is masked
# by the successful `tee` at the end of the pipeline and the script would
# exit 0 with an empty/partial results file.
set -euo pipefail

source /opt/aws_neuronx_venv_pytorch_inference_vllm_0_16/bin/activate

MODEL_PATH="${MIMO_V2_5_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Neuron-FP8}"
PORT="${PORT:-8000}"
CONCURRENCY="${CONCURRENCY:-1}"
NUM_PROMPTS="${NUM_PROMPTS:-16}"
INPUT_LEN="${INPUT_LEN:-900}"
OUTPUT_LEN="${OUTPUT_LEN:-90}"
RANGE_RATIO="${RANGE_RATIO:-0.03}"
CONFIG_NAME="${CONFIG_NAME:-bs1_tp64_ep1}"
RESULTS_DIR="${RESULTS_DIR:-/opt/dlami/nvme/logs/bench_results/mimo_v2_5}"

mkdir -p "$RESULTS_DIR"

echo "=========================================="
echo "MiMo-V2.5 single-run benchmark"
echo "=========================================="
echo " Model: $MODEL_PATH"
echo " Port: $PORT"
echo " Config: $CONFIG_NAME"
echo " Concurrency: $CONCURRENCY"
echo " Prompts: $NUM_PROMPTS"
echo " Input len: $INPUT_LEN Output len: $OUTPUT_LEN"
echo " Results: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
echo ""

# Quick health check — fail fast with a hint rather than letting the bench
# tool time out.
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
  echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
  echo "Start it first (e.g., bench_mimo_v2_5.sh) and wait until"
  echo "'Application startup complete.' is printed."
  exit 1
fi

vllm bench serve \
  --backend vllm \
  --model "$MODEL_PATH" \
  --tokenizer "$MODEL_PATH" \
  --endpoint /v1/completions \
  --dataset-name random \
  --num-prompts "$NUM_PROMPTS" \
  --random-input-len "$INPUT_LEN" \
  --random-output-len "$OUTPUT_LEN" \
  --random-range-ratio "$RANGE_RATIO" \
  --max-concurrency "$CONCURRENCY" \
  2>&1 | tee "$RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"

echo ""
echo "Saved to: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
59 changes: 59 additions & 0 deletions contrib/models/MiMo-V2.5/perf_test/sanity_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# Quick sanity check against an already-running vLLM server.
#
# Assumes vLLM is already listening on $PORT (default 8000) with MiMo-V2.5
# loaded. Sends a single chat completion and prints the model's reply.
#
# Usage:
# bash sanity_check.sh # uses defaults
# PORT=8001 bash sanity_check.sh # custom port
# PROMPT="..." bash sanity_check.sh # custom prompt

set -euo pipefail

MODEL_PATH="${MIMO_V2_5_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Neuron-FP8}"
PORT="${PORT:-8000}"
PROMPT="${PROMPT:-What is 1+1? Answer briefly.}"
MAX_TOKENS="${MAX_TOKENS:-64}"

echo "Sanity check: POST /v1/chat/completions on port $PORT"
echo " Model: $MODEL_PATH"
echo " Prompt: $PROMPT"
echo " Max tokens: $MAX_TOKENS"
echo ""

# Health check first — fail fast if server isn't up.
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
  echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
  echo "Start it with 'bash bench_mimo_v2_5.sh' or your own launcher first."
  exit 1
fi

# Build the request body with json.dumps instead of interpolating shell vars
# into a JSON heredoc: a double quote, backslash, or newline in $PROMPT (or
# $MODEL_PATH) would otherwise produce invalid JSON. Values are handed over
# via the environment so no shell quoting can corrupt them.
PAYLOAD=$(SC_PROMPT="$PROMPT" SC_MODEL="$MODEL_PATH" SC_MAX_TOKENS="$MAX_TOKENS" python3 -c "
import json, os
print(json.dumps({
    'messages': [{'role': 'user', 'content': os.environ['SC_PROMPT']}],
    'model': os.environ['SC_MODEL'],
    'max_tokens': int(os.environ['SC_MAX_TOKENS']),
    'temperature': 0.0,
    'stream': False,
}))
")

RESPONSE=$(curl -s "http://localhost:$PORT/v1/chat/completions" \
  -H 'Content-Type: application/json' \
  -d "$PAYLOAD")

echo "Response:"
echo "$RESPONSE" | python3 -m json.tool 2>/dev/null || echo "$RESPONSE"
echo ""

# Extract the model's reply for a human-friendly one-liner summary.
REPLY=$(echo "$RESPONSE" | python3 -c "
import json, sys
try:
    r = json.load(sys.stdin)
    print(r['choices'][0]['message']['content'].strip())
except Exception as e:
    print(f'(could not parse reply: {e})')
" 2>/dev/null)

echo "Model reply: $REPLY"
Loading