Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
447 changes: 447 additions & 0 deletions contrib/models/MiMo-V2.5/README.md

Large diffs are not rendered by default.

113 changes: 113 additions & 0 deletions contrib/models/MiMo-V2.5/perf_test/0_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/bin/bash
# Setup for MiMo-V2.5 vLLM benchmarking on Trn2.
#
# Clones upstream vllm-project/vllm-neuron at release-0.5.0 and applies
# vllm-neuron-patch.patch, which adds a runtime registration hook so the
# contrib NeuronMiMoV2ForCausalLM is plugged into both NxDI's MODEL_TYPES
# (under the key "mimov2") and vLLM's ModelRegistry (as
# MiMoV2ForCausalLM) at vllm-neuron plugin init time.
#
# Then downloads XiaomiMiMo/MiMo-V2.5 from HuggingFace (FP8 blockwise, ~320 GB).
#
# Environment:
#   MIMO_V2_5_PATH  optional; where to place the HF download
#                   (default /opt/dlami/nvme/models/MiMo-V2.5)
set -euo pipefail  # fail fast on errors, unset vars, and mid-pipeline failures

echo "=========================================="
echo "Setup: vllm-neuron + MiMo-V2.5 weights"
echo "=========================================="

source /opt/aws_neuronx_venv_pytorch_inference_vllm_0_16/bin/activate

# Resolve repo-relative paths up front — we cd into $HOME/vllm-neuron below,
# after which $0's relative form would no longer resolve.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PATCH_FILE="$SCRIPT_DIR/vllm-neuron-patch.patch"
CONTRIB_SRC="$(cd "$SCRIPT_DIR/.." && pwd)/src"

echo ""
echo "[1/2] Installing vllm-neuron (release-0.5.0) with the contrib registration patch..."

if [ ! -d "$HOME/vllm-neuron" ]; then
  git clone --branch release-0.5.0 https://github.com/vllm-project/vllm-neuron.git "$HOME/vllm-neuron"
fi

cd "$HOME/vllm-neuron"

# Apply patch (idempotent via `git apply --check` first).
if git apply --check "$PATCH_FILE" 2>/dev/null; then
  git apply "$PATCH_FILE"
  echo " Applied $PATCH_FILE"
else
  echo " Patch already applied or conflicts; continuing."
fi

pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com -e .

python3 -c "import vllm_neuron; print('vllm-neuron installed:', vllm_neuron.__file__)"

echo ""
echo "[2/2] Downloading MiMo-V2.5 FP8 weights from HuggingFace..."

MIMO_PATH="${MIMO_V2_5_PATH:-/opt/dlami/nvme/models/MiMo-V2.5}"
# `compgen -G` tests for at least one *.safetensors match without spawning
# an `ls | wc -l` pipeline (which would both mis-handle odd filenames and
# trip pipefail when the directory has no matches).
if [ -d "$MIMO_PATH" ] && compgen -G "$MIMO_PATH/*.safetensors" > /dev/null; then
  echo " MiMo-V2.5 weights already exist at $MIMO_PATH, skipping download"
else
  mkdir -p "$MIMO_PATH"
  huggingface-cli download XiaomiMiMo/MiMo-V2.5 --local-dir "$MIMO_PATH" --max-workers 16
  echo " Download complete: $(du -sh "$MIMO_PATH" | cut -f1)"
fi

NEURON_FP8_PATH="${MIMO_PATH}-Neuron-FP8"
COMPILED_PATH="/opt/dlami/nvme/compiled/mimo_v2_5_bs32_moetp1_ep64_fp8_vllm"

# Everything below is informational: print the follow-up commands and env
# vars the user should run/export next (nothing else is executed here).
echo ""
echo "========================================================================"
echo "Next steps"
echo "========================================================================"
echo ""
echo "1. Preprocess the FP8 checkpoint for Neuron (~16 min, ~15 GB peak RAM):"
echo ""
echo " python $CONTRIB_SRC/conversion_script/preprocess_mimo_v2_5_fp8.py \\"
echo " --hf_model_path $MIMO_PATH \\"
echo " --save_path $NEURON_FP8_PATH \\"
echo " --tp_degree 64"
echo ""
echo "2. Export the environment variables used by the smoke / bench scripts:"
echo ""
echo " # --- Required ---"
echo " # Contrib package src (registers NeuronMiMoV2ForCausalLM with vllm-neuron)."
echo " export NXDI_CONTRIB_MIMO_V2_5_SRC=$CONTRIB_SRC"
echo " # vLLM's builtin arch validator only knows MiMoV2FlashForCausalLM, so the"
echo " # preprocess rewrites the checkpoint's config.json architectures to that"
echo " # name. Alias V2.5 src to the Flash env var so vllm-neuron's contrib hook"
echo " # registers mimov2flash -> our V2.5 NeuronMiMoV2ForCausalLM class."
echo " export NXDI_CONTRIB_MIMO_V2_FLASH_SRC=\"\$NXDI_CONTRIB_MIMO_V2_5_SRC\""
echo " # Preprocessed Neuron-FP8 checkpoint."
echo " export MIMO_V2_5_PATH=$NEURON_FP8_PATH"
echo ""
echo " # --- Optional (recommended) ---"
echo " # vLLM compiles into <checkpoint>/neuron-compiled-artifacts/<hash>/ by"
echo " # default. Pin it to a persistent shared location so multiple configs"
echo " # don't collide and you can reuse the NEFF / sharded weights across runs."
echo " export NEURON_COMPILED_ARTIFACTS=$COMPILED_PATH"
echo " # NxDI's HLO/NEFF staging workdir (.hlo_module.pb etc). Default is"
echo " # /tmp/nxd_model/<compile-name>/; on Trn2 /tmp is wiped by the nightly"
echo " # reboot, and parallel compiles sharing the same basename silently"
echo " # overwrite each other's staged HLOs. Pin to a unique per-config"
echo " # directory under persistent storage."
echo " export BASE_COMPILE_WORK_DIR=/opt/dlami/nvme/tmp/nxd_model/\$(basename $COMPILED_PATH)"
echo " # First-time compile of V2.5's 256-expert MoE takes ~30 min (NEFF HLO +"
echo " # shard_checkpoint for 64 ranks). Extend vLLM's ready timeout."
echo " export VLLM_ENGINE_READY_TIMEOUT_S=7200"
echo ""
echo "3a. Run the one-shot benchmark (launches + benches + tears down):"
echo ""
echo " bash $SCRIPT_DIR/bench_mimo_v2_5.sh"
echo ""
echo "3b. ...OR keep a server up and probe it manually:"
echo ""
echo " # shell 1: server in foreground (Ctrl-C to stop)"
echo " bash $SCRIPT_DIR/start_vllm_server.sh"
echo ""
echo " # shell 2: once 'Application startup complete.' prints,"
echo " bash $SCRIPT_DIR/sanity_check.sh"
echo " CONCURRENCY=16 NUM_PROMPTS=128 bash $SCRIPT_DIR/run_bench_single.sh"
echo ""
87 changes: 87 additions & 0 deletions contrib/models/MiMo-V2.5/perf_test/bench_mimo_v2_5.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/bash
# MiMo-V2.5 FP8 vLLM benchmark on Trn2. One-shot wrapper:
# launch server -> sanity check -> bench at c=1,16,32 -> stop server.
#
# This script composes three building blocks in perf_test/:
# start_vllm_server.sh - server launch + env-var setup (backgrounded here)
# sanity_check.sh - one-shot curl against the running server
# run_bench_single.sh - one concurrency level of `vllm bench serve`
#
# Use those directly if you want to keep a long-running server and iterate
# on bench parameters from another shell.
#
# Server recipe: TP=64, moe_tp=1/moe_ep=64, BS=32, continuous batching.
# BS=32 is the smallest working batch size on the FP8 path (NxDI's TKG
# path refuses Expert Parallelism with BS < num_experts/top_k = 256/8 = 32).
# BS=1 single-stream latency demos are not currently supported on V2.5 FP8.
set -euo pipefail  # -u/-o pipefail: catch unset vars and mid-pipeline failures too

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PORT="${PORT:-8000}"
RESULTS_DIR="${RESULTS_DIR:-/opt/dlami/nvme/logs/bench_results/mimo_v2_5}"
# Encodes the server recipe above; used in the per-run output filenames.
CONFIG_NAME="bs32_tp64_moetp1_ep64"

mkdir -p "$RESULTS_DIR"
# poll for up to 2 h.
# Wait for the vLLM server to answer /health. First-time compile of the
# 256-expert MoE model takes ~30 min and can stretch past 2 h under
# contention, so poll for up to 2 h.
#
# Globals:   PORT (read)
# Returns:   0 once the server is healthy, 1 if the 2 h budget expires.
wait_for_server() {
  local interval=10
  local max_attempts=720          # 720 attempts * 10 s = 7200 s = 2 h
  local start=$SECONDS
  local i
  echo " Waiting for vLLM server on port $PORT (up to 2 h for first compile)..."
  for (( i = 1; i <= max_attempts; i++ )); do
    if curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
      echo " Server ready after $((SECONDS - start))s."
      return 0
    fi
    # Progress note once a minute so a long first compile doesn't look hung.
    if (( i % 6 == 0 )); then
      echo " ...still waiting ($((SECONDS - start))s elapsed)"
    fi
    sleep "$interval"
  done
  echo " ERROR: Server did not start within $((max_attempts * interval))s"
  return 1
}

# Tear down the vLLM server. Prefers the PID captured at launch (SERVER_PID,
# set by the main flow) so we only signal our own server, then sweeps by
# process name as a fallback since start_vllm_server.sh execs python3 itself.
# Runs as the EXIT trap, so nothing here may abort shutdown — hence `|| true`.
stop_server() {
  echo " Stopping vLLM server..."
  if [ -n "${SERVER_PID:-}" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
    kill "$SERVER_PID" 2>/dev/null || true
  fi
  pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true
  sleep 5
}

printf '==========================================\n'
printf 'MiMo-V2.5 FP8 Performance Benchmark\n'
printf '==========================================\n'
printf 'Port: %s\n' "$PORT"
printf 'Results: %s\n' "$RESULTS_DIR"
printf '\n'

# Launch the server in the background. start_vllm_server.sh owns all the
# env vars (MODEL_PATH, NEURON_COMPILED_ARTIFACTS, BASE_COMPILE_WORK_DIR,
# contrib src registration, etc.) and execs `python3 -m vllm...`.
bash "$SCRIPT_DIR/start_vllm_server.sh" &
SERVER_PID=$!
trap stop_server EXIT

wait_for_server

# One-shot sanity check (curl the chat endpoint); best-effort only.
PORT="$PORT" bash "$SCRIPT_DIR/sanity_check.sh" || true

# Sweep three concurrency levels. run_bench_single.sh reads its knobs from
# the environment; see its header for all the options.
for pair in "1:16" "16:128" "32:128"; do
  PORT="$PORT" RESULTS_DIR="$RESULTS_DIR" CONFIG_NAME="$CONFIG_NAME" \
    CONCURRENCY="${pair%%:*}" NUM_PROMPTS="${pair##*:}" \
    bash "$SCRIPT_DIR/run_bench_single.sh"
done

printf '==========================================\n'
printf 'MiMo-V2.5 FP8 benchmark complete!\n'
printf 'Results saved to: %s\n' "$RESULTS_DIR"
printf '==========================================\n'
ls -la "$RESULTS_DIR"
76 changes: 76 additions & 0 deletions contrib/models/MiMo-V2.5/perf_test/run_bench_single.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash
# Run a single vllm-bench-serve pass against an already-running vLLM server.
#
# Unlike bench_mimo_v2_5.sh this script does NOT launch or kill the vLLM
# server — you bring your own. That makes it convenient when the bench driver
# in bench_mimo_v2_5.sh times out during first-time compilation: the server
# keeps running, and once it's ready you can collect numbers with this.
#
# Usage:
# bash run_bench_single.sh # defaults: c=1, 16 prompts
# CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
# CONFIG_NAME=bs32_tp1_ep64_opt CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
#
# Environment knobs:
# PORT vLLM server port (default 8000)
# MIMO_V2_5_PATH Path to the Neuron-FP8 checkpoint (default
# /opt/dlami/nvme/models/MiMo-V2.5-Neuron-FP8)
# CONCURRENCY --max-concurrency (default 1)
# NUM_PROMPTS --num-prompts (default 16)
# INPUT_LEN --random-input-len (default 900)
# OUTPUT_LEN --random-output-len (default 90)
# RANGE_RATIO --random-range-ratio (default 0.03)
# CONFIG_NAME Used in the output filename (default bs1_tp64_ep1)
# RESULTS_DIR Where to dump per-run log (default /opt/dlami/nvme/logs/bench_results/mimo_v2_5)

# pipefail matters here: without it a failing `vllm bench serve` is masked
# by the successful `tee` at the end of the pipeline and the script would
# exit 0 with an empty/partial results file.
set -euo pipefail

source /opt/aws_neuronx_venv_pytorch_inference_vllm_0_16/bin/activate

MODEL_PATH="${MIMO_V2_5_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Neuron-FP8}"
PORT="${PORT:-8000}"
CONCURRENCY="${CONCURRENCY:-1}"
NUM_PROMPTS="${NUM_PROMPTS:-16}"
INPUT_LEN="${INPUT_LEN:-900}"
OUTPUT_LEN="${OUTPUT_LEN:-90}"
RANGE_RATIO="${RANGE_RATIO:-0.03}"
CONFIG_NAME="${CONFIG_NAME:-bs1_tp64_ep1}"
RESULTS_DIR="${RESULTS_DIR:-/opt/dlami/nvme/logs/bench_results/mimo_v2_5}"

mkdir -p "$RESULTS_DIR"

echo "=========================================="
echo "MiMo-V2.5 single-run benchmark"
echo "=========================================="
echo " Model: $MODEL_PATH"
echo " Port: $PORT"
echo " Config: $CONFIG_NAME"
echo " Concurrency: $CONCURRENCY"
echo " Prompts: $NUM_PROMPTS"
echo " Input len: $INPUT_LEN Output len: $OUTPUT_LEN"
echo " Results: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
echo ""

# Quick health check — fail fast with a hint rather than letting the bench
# tool time out.
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
  echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
  echo "Start it first (e.g., bench_mimo_v2_5.sh) and wait until"
  echo "'Application startup complete.' is printed."
  exit 1
fi

vllm bench serve \
  --backend vllm \
  --model "$MODEL_PATH" \
  --tokenizer "$MODEL_PATH" \
  --endpoint /v1/completions \
  --dataset-name random \
  --num-prompts "$NUM_PROMPTS" \
  --random-input-len "$INPUT_LEN" \
  --random-output-len "$OUTPUT_LEN" \
  --random-range-ratio "$RANGE_RATIO" \
  --max-concurrency "$CONCURRENCY" \
  2>&1 | tee "$RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"

echo ""
echo "Saved to: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
59 changes: 59 additions & 0 deletions contrib/models/MiMo-V2.5/perf_test/sanity_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# Quick sanity check against an already-running vLLM server.
#
# Assumes vLLM is already listening on $PORT (default 8000) with MiMo-V2.5
# loaded. Sends a single chat completion and prints the model's reply.
#
# Usage:
# bash sanity_check.sh # uses defaults
# PORT=8001 bash sanity_check.sh # custom port
# PROMPT="..." bash sanity_check.sh # custom prompt

set -euo pipefail

MODEL_PATH="${MIMO_V2_5_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Neuron-FP8}"
PORT="${PORT:-8000}"
PROMPT="${PROMPT:-What is 1+1? Answer briefly.}"
MAX_TOKENS="${MAX_TOKENS:-64}"

echo "Sanity check: POST /v1/chat/completions on port $PORT"
echo " Model: $MODEL_PATH"
echo " Prompt: $PROMPT"
echo " Max tokens: $MAX_TOKENS"
echo ""

# Health check first — fail fast if server isn't up.
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
  echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
  echo "Start it with 'bash bench_mimo_v2_5.sh' or your own launcher first."
  exit 1
fi

# Build the request body with json.dumps instead of interpolating shell vars
# into a JSON heredoc: a double quote, backslash, or newline in $PROMPT (or
# $MODEL_PATH) would otherwise produce invalid JSON. Values are handed over
# via the environment so no shell quoting can corrupt them.
PAYLOAD=$(SC_PROMPT="$PROMPT" SC_MODEL="$MODEL_PATH" SC_MAX_TOKENS="$MAX_TOKENS" python3 -c "
import json, os
print(json.dumps({
    'messages': [{'role': 'user', 'content': os.environ['SC_PROMPT']}],
    'model': os.environ['SC_MODEL'],
    'max_tokens': int(os.environ['SC_MAX_TOKENS']),
    'temperature': 0.0,
    'stream': False,
}))
")

RESPONSE=$(curl -s "http://localhost:$PORT/v1/chat/completions" \
  -H 'Content-Type: application/json' \
  -d "$PAYLOAD")

echo "Response:"
echo "$RESPONSE" | python3 -m json.tool 2>/dev/null || echo "$RESPONSE"
echo ""

# Extract the model's reply for a human-friendly one-liner summary.
REPLY=$(echo "$RESPONSE" | python3 -c "
import json, sys
try:
    r = json.load(sys.stdin)
    print(r['choices'][0]['message']['content'].strip())
except Exception as e:
    print(f'(could not parse reply: {e})')
" 2>/dev/null)

echo "Model reply: $REPLY"
Loading