24 commits
82d7ead
[contrib] Add MiMo-V2.5-Pro initial port (copied from MiMo-V2-Pro, re…
whn09 Apr 27, 2026
e4a13c1
[contrib] MiMo-V2.5-Pro: apply review findings from HF reference diff
whn09 Apr 27, 2026
78b8e0a
[contrib] MiMo-V2.5-Pro: document current WIP status in README
whn09 Apr 28, 2026
e990f76
[contrib] MiMo-V2.5-Pro: correct FP8 root-cause framing in Status
whn09 Apr 28, 2026
f0d9c0b
[contrib] MiMo-V2.5-Pro: parallel preprocess + NVMe mount docs
whn09 Apr 28, 2026
b455a1e
[contrib] MiMo-V2.5-Pro: set AWS Llama-405B FP8 env vars in smoke scr…
whn09 Apr 28, 2026
9d29bbf
[contrib] MiMo-V2.5-Pro: fix tokenizer padding_side='left' for decode…
whn09 Apr 28, 2026
2a4c9ff
[contrib] MiMo-V2.5-Pro: skip FP8 quant on q/k/v_proj (attention path)
whn09 Apr 28, 2026
2b8b577
[contrib] MiMo-V2.5-Pro: try use_torch_block_wise + restore FP8 q/k/v
whn09 Apr 28, 2026
b3a8487
[contrib] MiMo-V2.5-Pro: wire up vLLM serving and record FP8 perf
whn09 Apr 29, 2026
bbb1e1f
[contrib] MiMo-V2.5-Pro: fix physical NC count (128, not 32)
whn09 Apr 29, 2026
d0eb413
[contrib] MiMo-V2.5-Pro: split vLLM launcher into start/bench/sanity …
whn09 Apr 29, 2026
2a181c1
[contrib] MiMo-V2.5-Pro: swap sanity_check default prompt to self-intro
whn09 Apr 29, 2026
82d5797
[contrib] MiMo-V2.5-Pro: BF16-attn recipe restores coherent output
whn09 Apr 29, 2026
cf60b91
[contrib] MiMo-V2.5-Pro: fold BF16 attn into preprocess; drop repatch
whn09 Apr 29, 2026
62fea53
[contrib] MiMo-V2.5-Pro: standardize on pytorch_inference_vllm_0_16 venv
whn09 Apr 29, 2026
e9ae094
[contrib] MiMo-V2.5-Pro: trim Status, reframe perf numbers as historical
whn09 Apr 29, 2026
d1ea946
[contrib] MiMo-V2.5-Pro: fix maintainer name typo
whn09 Apr 29, 2026
ad81f3f
[contrib] MiMo-V2.5-Pro: align start_vllm_server.sh with BF16-attn ckpt
whn09 Apr 29, 2026
5a297cb
[contrib] MiMo-V2.5-Pro: fit sanity + bench within seq_len=256
whn09 Apr 29, 2026
965a947
[contrib] MiMo-V2.5-Pro: sanity_check.sh — short system prompt, /v1/chat
whn09 Apr 29, 2026
935510a
[contrib] MiMo-V2.5-Pro: default vLLM to smoke NEFF (workaround)
whn09 Apr 29, 2026
6da7188
Revert "[contrib] MiMo-V2.5-Pro: default vLLM to smoke NEFF (workarou…
whn09 Apr 29, 2026
af27106
[contrib] MiMo-V2.5-Pro: bump default seq_len 256 -> 512; document vL…
whn09 Apr 30, 2026
425 changes: 425 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/README.md

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/perf_test/0_setup.sh
@@ -0,0 +1,61 @@
#!/bin/bash
# Setup for MiMo-V2.5-Pro vLLM benchmarking on Trn2.
#
# This clones upstream vllm-project/vllm-neuron at release-0.5.0 (which already
# has the mimov2flash -> mimo_v2 model_type rewrite), then applies
# vllm-neuron-patch.patch to add a runtime registration hook so the contrib
# NeuronMiMoV2ForCausalLM is plugged into both NxDI's MODEL_TYPES and vLLM's
# ModelRegistry at vllm-neuron plugin init time.
set -e

echo "=========================================="
echo "Setup: vllm-neuron + MiMo-V2.5-Pro weights"
echo "=========================================="

source /opt/aws_neuronx_venv_pytorch_inference_vllm_0_16/bin/activate

PATCH_FILE="$(cd "$(dirname "$0")" && pwd)/vllm-neuron-patch.patch"

echo ""
echo "[1/2] Installing vllm-neuron (release-0.5.0) with the contrib registration patch..."

if [ ! -d $HOME/vllm-neuron ]; then
    git clone --branch release-0.5.0 https://github.com/vllm-project/vllm-neuron.git $HOME/vllm-neuron
fi

cd $HOME/vllm-neuron

# Apply patch (idempotent via `git apply --check` first).
if git apply --check "$PATCH_FILE" 2>/dev/null; then
    git apply "$PATCH_FILE"
    echo " Applied $PATCH_FILE"
else
    echo " Patch already applied or conflicts; continuing."
fi

pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com -e .
pip install s5cmd

python3 -c "import vllm_neuron; print('vllm-neuron installed:', vllm_neuron.__file__)"

echo ""
echo "[2/2] Downloading MiMo-V2.5-Pro Neuron-FP8 weights..."

MIMO_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8}"
if [ -d "$MIMO_PATH" ] && [ "$(ls "$MIMO_PATH"/*.safetensors 2>/dev/null | wc -l)" -gt 0 ]; then
echo " MiMo weights already exist at $MIMO_PATH, skipping download"
else
echo " Downloading Neuron-FP8 weights from your S3 bucket (edit the URI if needed)..."
mkdir -p "$MIMO_PATH"
s5cmd cp "s3://datalab/xiaomi/models/MiMo-V2.5-Pro-Neuron-FP8/**" "$MIMO_PATH/"
echo " Download complete: $(du -sh $MIMO_PATH | cut -f1)"
fi

# Figure out where this contrib package's src/ lives so the registration hook
# can add it to sys.path inside vllm-neuron.
CONTRIB_SRC="$(cd "$(dirname "$0")/.." && pwd)/src"

echo ""
echo "Setup complete. Before running the benchmark, export:"
echo " export MIMO_V2_FLASH_PATH=$MIMO_PATH"
echo " export NXDI_CONTRIB_MIMO_V2_FLASH_SRC=$CONTRIB_SRC"
87 changes: 87 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/perf_test/bench_mimo_v2.sh
@@ -0,0 +1,87 @@
#!/bin/bash
set -e

# MiMo-V2.5-Pro FP8 vLLM benchmark on Trn2. One-shot wrapper:
# launch server -> sanity check -> bench at c=1,16,48 -> stop server.
#
# This script composes three building blocks in perf_test/:
# start_vllm_server.sh - server launch + env-var setup (backgrounded here)
# sanity_check.sh - one-shot curl against the running server
# run_bench_single.sh - one concurrency level of `vllm bench serve`
#
# Use those directly if you want to keep a long-running server and iterate
# on bench parameters from another shell.
#
# Server recipe: TP=64, moe_tp=1/moe_ep=64, BS=48, continuous batching.
# BS=48 is the smallest working batch size on the FP8 path (NxDI's TKG
# path refuses Expert Parallelism with BS < num_experts/top_k = 384/8 = 48).
# BS=1 single-stream latency demos are not currently supported on Pro FP8.

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PORT="${PORT:-8000}"
RESULTS_DIR="${RESULTS_DIR:-/opt/dlami/nvme/logs/bench_results/mimo_v2_5_pro}"
CONFIG_NAME="bs48_tp64_moetp1_ep64"

mkdir -p "$RESULTS_DIR"

# Wait for vLLM server to be ready. First-time compile of the 384-expert
# MoE model takes ~90 min and can stretch past 2 h under contention, so
# poll for up to 2 h.
wait_for_server() {
    echo " Waiting for vLLM server on port $PORT (up to 2 h for first compile)..."
    local interval=10
    local max_attempts=720
    local start=$SECONDS
    for i in $(seq 1 $max_attempts); do
        if curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
            echo " Server ready after $((SECONDS - start))s."
            return 0
        fi
        if [ $((i % 6)) -eq 0 ]; then
            echo " ...still waiting ($((SECONDS - start))s elapsed)"
        fi
        sleep $interval
    done
    echo " ERROR: Server did not start within $((max_attempts * interval))s"
    return 1
}

stop_server() {
    echo " Stopping vLLM server..."
    pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true
    sleep 5
}

echo "=========================================="
echo "MiMo-V2.5-Pro FP8 Performance Benchmark"
echo "=========================================="
echo "Port: $PORT"
echo "Results: $RESULTS_DIR"
echo ""

# Start the server in the background. start_vllm_server.sh handles all the
# env vars (MODEL_PATH, NEURON_COMPILED_ARTIFACTS, BASE_COMPILE_WORK_DIR,
# contrib src registration, etc.) and execs `python3 -m vllm...`.
bash "$SCRIPT_DIR/start_vllm_server.sh" &
SERVER_PID=$!
trap stop_server EXIT

wait_for_server

# One-shot sanity check (curl the chat endpoint).
PORT="$PORT" bash "$SCRIPT_DIR/sanity_check.sh" || true

# Three concurrency levels. run_bench_single.sh reads knobs from the
# environment; see its header for all the options.
PORT="$PORT" RESULTS_DIR="$RESULTS_DIR" CONFIG_NAME="$CONFIG_NAME" \
CONCURRENCY=1 NUM_PROMPTS=16 bash "$SCRIPT_DIR/run_bench_single.sh"
PORT="$PORT" RESULTS_DIR="$RESULTS_DIR" CONFIG_NAME="$CONFIG_NAME" \
CONCURRENCY=16 NUM_PROMPTS=128 bash "$SCRIPT_DIR/run_bench_single.sh"
PORT="$PORT" RESULTS_DIR="$RESULTS_DIR" CONFIG_NAME="$CONFIG_NAME" \
CONCURRENCY=48 NUM_PROMPTS=192 bash "$SCRIPT_DIR/run_bench_single.sh"

echo "=========================================="
echo "MiMo-V2.5-Pro FP8 benchmark complete!"
echo "Results saved to: $RESULTS_DIR"
echo "=========================================="
ls -la "$RESULTS_DIR"
79 changes: 79 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/perf_test/run_bench_single.sh
@@ -0,0 +1,79 @@
#!/bin/bash
# Run a single vllm-bench-serve pass against an already-running vLLM server.
#
# Unlike bench_mimo_v2.sh this script does NOT launch or kill the vLLM
# server — you bring your own. That makes it convenient when the bench driver
# in bench_mimo_v2.sh times out during first-time compilation: the server
# keeps running, and once it's ready you can collect numbers with this.
#
# Usage:
# bash run_bench_single.sh # defaults: c=1, 16 prompts
# CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
# CONFIG_NAME=bs32_tp1_ep64_opt CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
#
# Environment knobs:
# PORT vLLM server port (default 8000)
# MIMO_V2_FLASH_PATH Path to the Neuron-FP8 checkpoint (default
# /opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8)
# CONCURRENCY --max-concurrency (default 1)
# NUM_PROMPTS --num-prompts (default 16)
# INPUT_LEN --random-input-len (default 180; matches seq_len=256)
# OUTPUT_LEN --random-output-len (default 60; matches seq_len=256)
# RANGE_RATIO --random-range-ratio (default 0.03)
# CONFIG_NAME Used in the output filename (default bs48_tp64_moetp1_ep64)
# RESULTS_DIR Where to dump per-run log
# (default /opt/dlami/nvme/logs/bench_results/mimo_v2_5_pro)

set -e

source /opt/aws_neuronx_venv_pytorch_inference_vllm_0_16/bin/activate

MODEL_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8}"
PORT="${PORT:-8000}"
CONCURRENCY="${CONCURRENCY:-1}"
NUM_PROMPTS="${NUM_PROMPTS:-16}"
INPUT_LEN="${INPUT_LEN:-180}"
OUTPUT_LEN="${OUTPUT_LEN:-60}"
RANGE_RATIO="${RANGE_RATIO:-0.03}"
# seq_len=256 on the compiled server, so input+output must stay under 256.
# Default 180+60=240 leaves a small margin for random-range-ratio expansion.
CONFIG_NAME="${CONFIG_NAME:-bs48_tp64_moetp1_ep64}"
RESULTS_DIR="${RESULTS_DIR:-/opt/dlami/nvme/logs/bench_results/mimo_v2_5_pro}"

mkdir -p "$RESULTS_DIR"

echo "=========================================="
echo "MiMo-V2.5-Pro single-run benchmark"
echo "=========================================="
echo " Model: $MODEL_PATH"
echo " Port: $PORT"
echo " Config: $CONFIG_NAME"
echo " Concurrency: $CONCURRENCY"
echo " Prompts: $NUM_PROMPTS"
echo " Input len: $INPUT_LEN Output len: $OUTPUT_LEN"
echo " Results: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
echo ""

# Quick health check
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
    echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
    echo "Start it first (e.g., bench_mimo_v2.sh) and wait until"
    echo "'Application startup complete.' is printed."
    exit 1
fi

vllm bench serve \
    --backend vllm \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --endpoint /v1/completions \
    --dataset-name random \
    --num-prompts "$NUM_PROMPTS" \
    --random-input-len "$INPUT_LEN" \
    --random-output-len "$OUTPUT_LEN" \
    --random-range-ratio "$RANGE_RATIO" \
    --max-concurrency "$CONCURRENCY" \
    2>&1 | tee "$RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"

echo ""
echo "Saved to: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
88 changes: 88 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/perf_test/sanity_check.sh
@@ -0,0 +1,88 @@
#!/bin/bash
# Quick sanity check against an already-running vLLM server.
#
# Posts a chat request to /v1/chat/completions and prints the reply.
#
# Pro's default chat template prepends a ~240-token system prompt that by
# itself overflows the seq_len=256 compile-time bucket, so we send an
# explicit short system message — apply_chat_template then uses ours
# instead of the default and the whole prompt fits in ~25 tokens.
#
# Usage:
# bash sanity_check.sh # uses defaults
# PORT=8001 bash sanity_check.sh # custom port
# PROMPT="..." bash sanity_check.sh # custom user content
# SYSTEM="..." bash sanity_check.sh # custom system message

set -e

MODEL_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8}"
PORT="${PORT:-8000}"
# Short system message (keeps total prompt ~25 tokens) — the checkpoint's
# default system prompt is ~240 tokens and would overflow seq_len=256.
SYSTEM="${SYSTEM:-You are MiMo, a helpful assistant developed by Xiaomi.}"
# "Introduce yourself" is the self-identification prompt that consistently
# lands in the model's MiMo-aware region. Swap PROMPT=... to probe others.
PROMPT="${PROMPT:-Hello! Please introduce yourself in one sentence.}"
MAX_TOKENS="${MAX_TOKENS:-80}"

echo "Sanity check: POST /v1/chat/completions on port $PORT"
echo " Model: $MODEL_PATH"
echo " System: $SYSTEM"
echo " Prompt: $PROMPT"
echo " Max tokens: $MAX_TOKENS"
echo ""

# Health check first — fail fast if server isn't up.
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
    echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
    echo "Start it with 'bash start_vllm_server.sh' (or bench_mimo_v2.sh)"
    echo "first and wait for 'Application startup complete.'"
    exit 1
fi

# NOTE: request-side `temperature` / `top_k` / `top_p` are ignored by
# vllm-neuron on this model: the on_device_sampling_config baked into the
# NEFF at compile time wins. Output is always stochastic; re-run to see
# variance, or restart the server with `do_sample=false` in
# start_vllm_server.sh to force deterministic greedy decoding.
python3 <<PYEOF
import json
import sys
import urllib.error
import urllib.request

model = "$MODEL_PATH"
system = """$SYSTEM"""
user = """$PROMPT"""
body = json.dumps({
    "model": model,
    "messages": [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ],
    "max_tokens": int("$MAX_TOKENS"),
    "stream": False,
}).encode()
req = urllib.request.Request(
    "http://localhost:$PORT/v1/chat/completions",
    data=body,
    headers={"Content-Type": "application/json"},
)
try:
    with urllib.request.urlopen(req, timeout=120) as r:
        resp = json.load(r)
except urllib.error.HTTPError as e:
    print("HTTP error:", e.code, e.read().decode(errors="replace"))
    sys.exit(1)

if "error" in resp:
    print("Error from server:", json.dumps(resp["error"], indent=2))
    sys.exit(1)

text = resp["choices"][0]["message"]["content"]
print("Response:")
print(text)
print()
print("Usage:", resp.get("usage", {}))
PYEOF
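
To see the token-budget problem the short system message works around, you can count what the chat template actually emits. A minimal sketch, assuming the checkpoint ships a standard Hugging Face tokenizer and chat template (run it inside the same venv; the fallback path mirrors the scripts above):

# Count prompt tokens with and without an explicit system message, to see why the
# checkpoint's default ~240-token system prompt overflows the seq_len=256 bucket.
import os
from transformers import AutoTokenizer

model_path = os.environ.get(
    "MIMO_V2_FLASH_PATH", "/opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8"
)
tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

user = {"role": "user", "content": "Hello! Please introduce yourself in one sentence."}
short_system = {"role": "system",
                "content": "You are MiMo, a helpful assistant developed by Xiaomi."}

for label, messages in [("default template (no system message)", [user]),
                        ("explicit short system message", [short_system, user])]:
    ids = tok.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
    print(f"{label}: {len(ids)} prompt tokens (budget: 256 minus max_tokens)")
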