24 commits
82d7ead
[contrib] Add MiMo-V2.5-Pro initial port (copied from MiMo-V2-Pro, re…
whn09 Apr 27, 2026
e4a13c1
[contrib] MiMo-V2.5-Pro: apply review findings from HF reference diff
whn09 Apr 27, 2026
78b8e0a
[contrib] MiMo-V2.5-Pro: document current WIP status in README
whn09 Apr 28, 2026
e990f76
[contrib] MiMo-V2.5-Pro: correct FP8 root-cause framing in Status
whn09 Apr 28, 2026
f0d9c0b
[contrib] MiMo-V2.5-Pro: parallel preprocess + NVMe mount docs
whn09 Apr 28, 2026
b455a1e
[contrib] MiMo-V2.5-Pro: set AWS Llama-405B FP8 env vars in smoke scr…
whn09 Apr 28, 2026
9d29bbf
[contrib] MiMo-V2.5-Pro: fix tokenizer padding_side='left' for decode…
whn09 Apr 28, 2026
2a4c9ff
[contrib] MiMo-V2.5-Pro: skip FP8 quant on q/k/v_proj (attention path)
whn09 Apr 28, 2026
2b8b577
[contrib] MiMo-V2.5-Pro: try use_torch_block_wise + restore FP8 q/k/v
whn09 Apr 28, 2026
b3a8487
[contrib] MiMo-V2.5-Pro: wire up vLLM serving and record FP8 perf
whn09 Apr 29, 2026
bbb1e1f
[contrib] MiMo-V2.5-Pro: fix physical NC count (128, not 32)
whn09 Apr 29, 2026
d0eb413
[contrib] MiMo-V2.5-Pro: split vLLM launcher into start/bench/sanity …
whn09 Apr 29, 2026
2a181c1
[contrib] MiMo-V2.5-Pro: swap sanity_check default prompt to self-intro
whn09 Apr 29, 2026
82d5797
[contrib] MiMo-V2.5-Pro: BF16-attn recipe restores coherent output
whn09 Apr 29, 2026
cf60b91
[contrib] MiMo-V2.5-Pro: fold BF16 attn into preprocess; drop repatch
whn09 Apr 29, 2026
62fea53
[contrib] MiMo-V2.5-Pro: standardize on pytorch_inference_vllm_0_16 venv
whn09 Apr 29, 2026
e9ae094
[contrib] MiMo-V2.5-Pro: trim Status, reframe perf numbers as historical
whn09 Apr 29, 2026
d1ea946
[contrib] MiMo-V2.5-Pro: fix maintainer name typo
whn09 Apr 29, 2026
ad81f3f
[contrib] MiMo-V2.5-Pro: align start_vllm_server.sh with BF16-attn ckpt
whn09 Apr 29, 2026
5a297cb
[contrib] MiMo-V2.5-Pro: fit sanity + bench within seq_len=256
whn09 Apr 29, 2026
965a947
[contrib] MiMo-V2.5-Pro: sanity_check.sh — short system prompt, /v1/chat
whn09 Apr 29, 2026
935510a
[contrib] MiMo-V2.5-Pro: default vLLM to smoke NEFF (workaround)
whn09 Apr 29, 2026
6da7188
Revert "[contrib] MiMo-V2.5-Pro: default vLLM to smoke NEFF (workarou…
whn09 Apr 29, 2026
af27106
[contrib] MiMo-V2.5-Pro: bump default seq_len 256 -> 512; document vL…
whn09 Apr 30, 2026
425 changes: 425 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/README.md

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/perf_test/0_setup.sh
@@ -0,0 +1,61 @@
#!/bin/bash
# Setup for MiMo-V2.5-Pro vLLM benchmarking on Trn2.
#
# This clones upstream vllm-project/vllm-neuron at release-0.5.0 (which already
# has the mimov2flash -> mimo_v2 model_type rewrite), then applies
# vllm-neuron-patch.patch to add a runtime registration hook so the contrib
# NeuronMiMoV2ForCausalLM is plugged into both NxDI's MODEL_TYPES and vLLM's
# ModelRegistry at vllm-neuron plugin init time.
set -e

echo "=========================================="
echo "Setup: vllm-neuron + MiMo-V2.5-Pro weights"
echo "=========================================="

source /opt/aws_neuronx_venv_pytorch_inference_vllm_0_16/bin/activate

PATCH_FILE="$(cd "$(dirname "$0")" && pwd)/vllm-neuron-patch.patch"

echo ""
echo "[1/2] Installing vllm-neuron (release-0.5.0) with the contrib registration patch..."

if [ ! -d $HOME/vllm-neuron ]; then
    git clone --branch release-0.5.0 https://github.com/vllm-project/vllm-neuron.git $HOME/vllm-neuron
fi

cd $HOME/vllm-neuron

# Apply patch (idempotent via `git apply --check` first).
if git apply --check "$PATCH_FILE" 2>/dev/null; then
    git apply "$PATCH_FILE"
    echo " Applied $PATCH_FILE"
else
    echo " Patch already applied or conflicts; continuing."
fi

pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com -e .
pip install s5cmd

python3 -c "import vllm_neuron; print('vllm-neuron installed:', vllm_neuron.__file__)"

echo ""
echo "[2/2] Downloading MiMo-V2.5-Pro Neuron-FP8 weights..."

MIMO_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8}"
if [ -d "$MIMO_PATH" ] && [ "$(ls "$MIMO_PATH"/*.safetensors 2>/dev/null | wc -l)" -gt 0 ]; then
echo " MiMo weights already exist at $MIMO_PATH, skipping download"
else
echo " Downloading Neuron-FP8 weights from your S3 bucket (edit the URI if needed)..."
mkdir -p "$MIMO_PATH"
s5cmd cp "s3://datalab/xiaomi/models/MiMo-V2.5-Pro-Neuron-FP8/**" "$MIMO_PATH/"
echo " Download complete: $(du -sh $MIMO_PATH | cut -f1)"
fi

# Figure out where this contrib package's src/ lives so the registration hook
# can add it to sys.path inside vllm-neuron.
CONTRIB_SRC="$(cd "$(dirname "$0")/.." && pwd)/src"

echo ""
echo "Setup complete. Before running the benchmark, export:"
echo " export MIMO_V2_FLASH_PATH=$MIMO_PATH"
echo " export NXDI_CONTRIB_MIMO_V2_FLASH_SRC=$CONTRIB_SRC"
87 changes: 87 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/perf_test/bench_mimo_v2.sh
@@ -0,0 +1,87 @@
#!/bin/bash
set -e

# MiMo-V2.5-Pro FP8 vLLM benchmark on Trn2. One-shot wrapper:
# launch server -> sanity check -> bench at c=1,16,48 -> stop server.
#
# This script composes three building blocks in perf_test/:
# start_vllm_server.sh - server launch + env-var setup (backgrounded here)
# sanity_check.sh - one-shot curl against the running server
# run_bench_single.sh - one concurrency level of `vllm bench serve`
#
# Use those directly if you want to keep a long-running server and iterate
# on bench parameters from another shell.
#
# Server recipe: TP=64, moe_tp=1/moe_ep=64, BS=48, continuous batching.
# BS=48 is the smallest working batch size on the FP8 path (NxDI's TKG
# path refuses Expert Parallelism with BS < num_experts/top_k = 384/8 = 48).
# BS=1 single-stream latency demos are not currently supported on Pro FP8.

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PORT="${PORT:-8000}"
RESULTS_DIR="${RESULTS_DIR:-/opt/dlami/nvme/logs/bench_results/mimo_v2_5_pro}"
CONFIG_NAME="bs48_tp64_moetp1_ep64"

mkdir -p "$RESULTS_DIR"

# Wait for vLLM server to be ready. First-time compile of the 384-expert
# MoE model takes ~90 min and can stretch past 2 h under contention, so
# poll for up to 2 h.
wait_for_server() {
    echo " Waiting for vLLM server on port $PORT (up to 2 h for first compile)..."
    local interval=10
    local max_attempts=720
    local start=$SECONDS
    for i in $(seq 1 $max_attempts); do
        if curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
            echo " Server ready after $((SECONDS - start))s."
            return 0
        fi
        if [ $((i % 6)) -eq 0 ]; then
            echo " ...still waiting ($((SECONDS - start))s elapsed)"
        fi
        sleep $interval
    done
    echo " ERROR: Server did not start within $((max_attempts * interval))s"
    return 1
}

stop_server() {
    echo " Stopping vLLM server..."
    pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true
    sleep 5
}

echo "=========================================="
echo "MiMo-V2.5-Pro FP8 Performance Benchmark"
echo "=========================================="
echo "Port: $PORT"
echo "Results: $RESULTS_DIR"
echo ""

# Start the server in the background. start_vllm_server.sh handles all the
# env vars (MODEL_PATH, NEURON_COMPILED_ARTIFACTS, BASE_COMPILE_WORK_DIR,
# contrib src registration, etc.) and execs `python3 -m vllm...`.
bash "$SCRIPT_DIR/start_vllm_server.sh" &
SERVER_PID=$!
trap stop_server EXIT

wait_for_server

# One-shot sanity check (curl the chat endpoint).
PORT="$PORT" bash "$SCRIPT_DIR/sanity_check.sh" || true

# Three concurrency levels. run_bench_single.sh reads knobs from the
# environment; see its header for all the options.
PORT="$PORT" RESULTS_DIR="$RESULTS_DIR" CONFIG_NAME="$CONFIG_NAME" \
CONCURRENCY=1 NUM_PROMPTS=16 bash "$SCRIPT_DIR/run_bench_single.sh"
PORT="$PORT" RESULTS_DIR="$RESULTS_DIR" CONFIG_NAME="$CONFIG_NAME" \
CONCURRENCY=16 NUM_PROMPTS=128 bash "$SCRIPT_DIR/run_bench_single.sh"
PORT="$PORT" RESULTS_DIR="$RESULTS_DIR" CONFIG_NAME="$CONFIG_NAME" \
CONCURRENCY=48 NUM_PROMPTS=192 bash "$SCRIPT_DIR/run_bench_single.sh"

echo "=========================================="
echo "MiMo-V2.5-Pro FP8 benchmark complete!"
echo "Results saved to: $RESULTS_DIR"
echo "=========================================="
ls -la "$RESULTS_DIR"
79 changes: 79 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/perf_test/run_bench_single.sh
@@ -0,0 +1,79 @@
#!/bin/bash
# Run a single vllm-bench-serve pass against an already-running vLLM server.
#
# Unlike bench_mimo_v2.sh this script does NOT launch or kill the vLLM
# server — you bring your own. That makes it convenient when the bench driver
# in bench_mimo_v2.sh times out during first-time compilation: the server
# keeps running, and once it's ready you can collect numbers with this.
#
# Usage:
# bash run_bench_single.sh # defaults: c=1, 16 prompts
# CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
# CONFIG_NAME=bs32_tp1_ep64_opt CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
#
# Environment knobs:
# PORT vLLM server port (default 8000)
# MIMO_V2_FLASH_PATH Path to the Neuron-FP8 checkpoint (default
# /opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8)
# CONCURRENCY --max-concurrency (default 1)
# NUM_PROMPTS --num-prompts (default 16)
# INPUT_LEN --random-input-len (default 180; matches seq_len=256)
# OUTPUT_LEN --random-output-len (default 60; matches seq_len=256)
# RANGE_RATIO --random-range-ratio (default 0.03)
# CONFIG_NAME Used in the output filename (default bs48_tp64_moetp1_ep64)
# RESULTS_DIR Where to dump per-run log
# (default /opt/dlami/nvme/logs/bench_results/mimo_v2_5_pro)

set -e

source /opt/aws_neuronx_venv_pytorch_inference_vllm_0_16/bin/activate

MODEL_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8}"
PORT="${PORT:-8000}"
CONCURRENCY="${CONCURRENCY:-1}"
NUM_PROMPTS="${NUM_PROMPTS:-16}"
INPUT_LEN="${INPUT_LEN:-180}"
OUTPUT_LEN="${OUTPUT_LEN:-60}"
RANGE_RATIO="${RANGE_RATIO:-0.03}"
# seq_len=256 on the compiled server, so input+output must stay under 256.
# Default 180+60=240 leaves a small margin for random-range-ratio expansion.
CONFIG_NAME="${CONFIG_NAME:-bs48_tp64_moetp1_ep64}"
RESULTS_DIR="${RESULTS_DIR:-/opt/dlami/nvme/logs/bench_results/mimo_v2_5_pro}"

mkdir -p "$RESULTS_DIR"

echo "=========================================="
echo "MiMo-V2.5-Pro single-run benchmark"
echo "=========================================="
echo " Model: $MODEL_PATH"
echo " Port: $PORT"
echo " Config: $CONFIG_NAME"
echo " Concurrency: $CONCURRENCY"
echo " Prompts: $NUM_PROMPTS"
echo " Input len: $INPUT_LEN Output len: $OUTPUT_LEN"
echo " Results: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
echo ""

# Quick health check
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
    echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
    echo "Start it first (e.g., bench_mimo_v2.sh) and wait until"
    echo "'Application startup complete.' is printed."
    exit 1
fi

vllm bench serve \
    --backend vllm \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --endpoint /v1/completions \
    --dataset-name random \
    --num-prompts "$NUM_PROMPTS" \
    --random-input-len "$INPUT_LEN" \
    --random-output-len "$OUTPUT_LEN" \
    --random-range-ratio "$RANGE_RATIO" \
    --max-concurrency "$CONCURRENCY" \
    2>&1 | tee "$RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"

echo ""
echo "Saved to: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
88 changes: 88 additions & 0 deletions contrib/models/MiMo-V2.5-Pro/perf_test/sanity_check.sh
@@ -0,0 +1,88 @@
#!/bin/bash
# Quick sanity check against an already-running vLLM server.
#
# Posts a chat request to /v1/chat/completions and prints the reply.
#
# Pro's default chat template prepends a ~240-token system prompt that by
# itself overflows the seq_len=256 compile-time bucket, so we send an
# explicit short system message — apply_chat_template then uses ours
# instead of the default and the whole prompt fits in ~25 tokens.
#
# Usage:
# bash sanity_check.sh # uses defaults
# PORT=8001 bash sanity_check.sh # custom port
# PROMPT="..." bash sanity_check.sh # custom user content
# SYSTEM="..." bash sanity_check.sh # custom system message

set -e

MODEL_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8}"
PORT="${PORT:-8000}"
# Short system message (keeps total prompt ~25 tokens) — the checkpoint's
# default system prompt is ~240 tokens and would overflow seq_len=256.
SYSTEM="${SYSTEM:-You are MiMo, a helpful assistant developed by Xiaomi.}"
# "Introduce yourself" is the self-identification prompt that consistently
# lands in the model's MiMo-aware region. Swap PROMPT=... to probe others.
PROMPT="${PROMPT:-Hello! Please introduce yourself in one sentence.}"
MAX_TOKENS="${MAX_TOKENS:-80}"

echo "Sanity check: POST /v1/chat/completions on port $PORT"
echo " Model: $MODEL_PATH"
echo " System: $SYSTEM"
echo " Prompt: $PROMPT"
echo " Max tokens: $MAX_TOKENS"
echo ""

# Health check first — fail fast if server isn't up.
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
    echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
    echo "Start it with 'bash start_vllm_server.sh' (or bench_mimo_v2.sh)"
    echo "first and wait for 'Application startup complete.'"
    exit 1
fi

# NOTE: request-side `temperature` / `top_k` / `top_p` are ignored by
# vllm-neuron on this model: the on_device_sampling_config baked into the
# NEFF at compile time wins. Output is always stochastic; re-run to see
# variance, or restart the server with `do_sample=false` in
# start_vllm_server.sh to force deterministic greedy decoding.
python3 <<PYEOF
import json
import sys
import urllib.error
import urllib.request

model = "$MODEL_PATH"
system = """$SYSTEM"""
user = """$PROMPT"""
body = json.dumps({
    "model": model,
    "messages": [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ],
    "max_tokens": int("$MAX_TOKENS"),
    "stream": False,
}).encode()
req = urllib.request.Request(
    "http://localhost:$PORT/v1/chat/completions",
    data=body,
    headers={"Content-Type": "application/json"},
)
try:
    with urllib.request.urlopen(req, timeout=120) as r:
        resp = json.load(r)
except urllib.error.HTTPError as e:
    print("HTTP error:", e.code, e.read().decode(errors="replace"))
    sys.exit(1)

if "error" in resp:
    print("Error from server:", json.dumps(resp["error"], indent=2))
    sys.exit(1)

text = resp["choices"][0]["message"]["content"]
print("Response:")
print(text)
print()
print("Usage:", resp.get("usage", {}))
PYEOF
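
To see the token-budget problem the short system message works around, you can count what the chat template actually emits. A minimal sketch, assuming the checkpoint ships a standard Hugging Face tokenizer and chat template (run it inside the same venv; the fallback path mirrors the scripts above):

# Count prompt tokens with and without an explicit system message, to see why the
# checkpoint's default ~240-token system prompt overflows the seq_len=256 bucket.
import os
from transformers import AutoTokenizer

model_path = os.environ.get(
    "MIMO_V2_FLASH_PATH", "/opt/dlami/nvme/models/MiMo-V2.5-Pro-Neuron-FP8"
)
tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

user = {"role": "user", "content": "Hello! Please introduce yourself in one sentence."}
short_system = {"role": "system",
                "content": "You are MiMo, a helpful assistant developed by Xiaomi."}

for label, messages in [("default template (no system message)", [user]),
                        ("explicit short system message", [short_system, user])]:
    ids = tok.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
    print(f"{label}: {len(ids)} prompt tokens (budget: 256 minus max_tokens)")
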