23 commits
ac186ab
[contrib] Add MiMo-V2-Flash NeuronX port (TP=64, EP=64 MoE)
whn09 Apr 22, 2026
88e247f
Switch vllm-neuron patch to a runtime registration hook on release-0.5.0
whn09 Apr 22, 2026
d40f579
vllm-neuron patch: move registration into _get_neuron_model_cls
whn09 Apr 22, 2026
83fb9e9
vllm-neuron patch: also monkey-patch AutoConfig for trust_remote_code
whn09 Apr 22, 2026
4b85816
bench: set use_torch_block_wise=true to avoid missing NKI kernel
whn09 Apr 22, 2026
d95207c
bench: extend wait_for_server timeout to 2h for MoE first-compile
whn09 Apr 22, 2026
f669f1c
perf_test: add sanity_check.sh and run_bench_single.sh helpers
whn09 Apr 22, 2026
42420cb
Add streaming FP8 preprocess script for MiMo-V2-Flash
whn09 Apr 23, 2026
2fbbdca
Enable FP8 inference for MiMo-V2-Flash (BF16 path unchanged)
whn09 Apr 23, 2026
ec2aa40
Add FP8 smoke tests and save_sharded_checkpoint to bench
whn09 Apr 23, 2026
d7ac76e
perf_test/0_setup: clone vllm-neuron into $HOME not /tmp
whn09 Apr 24, 2026
1259a1a
Apply attention_value_scale to value_states (matches HF reference)
whn09 Apr 24, 2026
1655dc9
Install FP8 monkey-patches in __init__ (belt-and-braces)
whn09 Apr 24, 2026
b9eea11
WIP: symmetric K/V head_dim via preprocess-side V padding
whn09 Apr 24, 2026
5b77f81
Revert "WIP: symmetric K/V head_dim via preprocess-side V padding"
whn09 Apr 24, 2026
13d855e
smoke: isolate BASE_COMPILE_WORK_DIR per COMPILED_PATH
whn09 Apr 24, 2026
1d3916a
Router bias: use arange + bf16 to survive XLA constant folding
whn09 Apr 24, 2026
30a30d3
smoke: pin outer ep_degree=1 (only moe_ep_degree varies)
whn09 Apr 25, 2026
d3c1c96
MoE scale expansion: use moe_tp_degree, not tp_degree
whn09 Apr 25, 2026
b9aaa74
smoke: default to moe_tp=1 / moe_ep=64 for correct FP8 output
whn09 Apr 25, 2026
305279f
bench: rewrite for FP8 recipe with moe_tp=1/moe_ep=64
whn09 Apr 25, 2026
21aca48
README: document the FP8 recipe and its constraints
whn09 Apr 25, 2026
c303b52
README: add Quick Start section for FP8 reproduction
whn09 Apr 25, 2026
364 changes: 364 additions & 0 deletions contrib/models/MiMo-V2-Flash/README.md

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions contrib/models/MiMo-V2-Flash/perf_test/0_setup.sh
@@ -0,0 +1,61 @@
#!/bin/bash
# Setup for MiMo-V2-Flash vLLM benchmarking on Trn2.
#
# This clones upstream vllm-project/vllm-neuron at release-0.5.0 (which already
# has the mimov2flash -> mimo_v2_flash model_type rewrite), then applies
# vllm-neuron-patch.patch to add a runtime registration hook so the contrib
# NeuronMiMoV2ForCausalLM is plugged into both NxDI's MODEL_TYPES and vLLM's
# ModelRegistry at vllm-neuron plugin init time.
set -e

echo "=========================================="
echo "Setup: vllm-neuron + MiMo-V2-Flash weights"
echo "=========================================="

source /opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/bin/activate

PATCH_FILE="$(cd "$(dirname "$0")" && pwd)/vllm-neuron-patch.patch"

echo ""
echo "[1/2] Installing vllm-neuron (release-0.5.0) with the contrib registration patch..."

if [ ! -d "$HOME/vllm-neuron" ]; then
    git clone --branch release-0.5.0 https://github.com/vllm-project/vllm-neuron.git "$HOME/vllm-neuron"
fi

cd "$HOME/vllm-neuron"

# Apply patch (idempotent via `git apply --check` first).
if git apply --check "$PATCH_FILE" 2>/dev/null; then
    git apply "$PATCH_FILE"
    echo " Applied $PATCH_FILE"
else
    echo " Patch already applied or conflicts; continuing."
fi

pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com -e .
pip install s5cmd

python3 -c "import vllm_neuron; print('vllm-neuron installed:', vllm_neuron.__file__)"

echo ""
echo "[2/2] Downloading MiMo-V2-Flash BF16 weights..."

MIMO_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2-Flash-BF16}"
if [ -d "$MIMO_PATH" ] && [ "$(ls "$MIMO_PATH"/*.safetensors 2>/dev/null | wc -l)" -gt 0 ]; then
echo " MiMo weights already exist at $MIMO_PATH, skipping download"
else
echo " Downloading BF16 weights from your S3 bucket (edit the URI if needed)..."
mkdir -p "$MIMO_PATH"
s5cmd cp "s3://datalab/xiaomi/models/MiMo-V2-Flash-BF16/**" "$MIMO_PATH/"
echo " Download complete: $(du -sh $MIMO_PATH | cut -f1)"
fi

# Figure out where this contrib package's src/ lives so the registration hook
# can add it to sys.path inside vllm-neuron.
CONTRIB_SRC="$(cd "$(dirname "$0")/.." && pwd)/src"

echo ""
echo "Setup complete. Before running the benchmark, export:"
echo " export MIMO_V2_FLASH_PATH=$MIMO_PATH"
echo " export NXDI_CONTRIB_MIMO_V2_FLASH_SRC=$CONTRIB_SRC"
228 changes: 228 additions & 0 deletions contrib/models/MiMo-V2-Flash/perf_test/bench_mimo_v2_flash.sh
@@ -0,0 +1,228 @@
#!/bin/bash
set -e

# MiMo-V2-Flash FP8 vLLM benchmark on Trn2.
#
# Requires a Neuron-FP8 preprocessed checkpoint (see
# `src/conversion_script/preprocess_mimo_v2_flash_fp8.py`). The configs below
# all use moe_tp_degree=1 / moe_ep_degree=64 (experts sharded by expert
# parallelism only, no intra-expert TP split) because moe_tp_degree=64 collapses
# the per-rank FP8 blockwise scale to a singleton — per-rank expert
# intermediate is 32 rows, below the 128-row blockwise block, so
# NxDI's `_setup_for_scale` drops per-channel scale granularity. The resulting
# drift compounds across 47 MoE layers and gives repetition / output collapse.
# Using moe_ep_degree=64 keeps all of each expert's weight + scale on one rank
# (4 experts per rank), which preserves the blockwise scale intact.
#
# NxDI's TKG path refuses Expert Parallelism with BS < num_experts/top_k
# (256 / 8 = 32 for Flash), so the smallest working batch size here is 32.
# If you want BS=1 behaviour, the FP8 path is not currently supported on
# this model on Trn2 — use the BF16 checkpoint with the old bench recipe
# (`moe_tp_degree=64, moe_ep_degree=1, batch_size=1`).
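#
# Worked numbers behind those constraints (the 2048-row expert intermediate
# below is inferred from "per-rank expert intermediate is 32 rows" at
# moe_tp=64; treat it as illustrative):
#   moe_tp=64: 2048 / 64 = 32 rows per rank < the 128-row blockwise block,
#              so the per-rank scale degenerates to a single value.
#   moe_ep=64: 256 experts / 64 ranks = 4 whole experts per rank; every
#              128x128 scale block survives intact.
#   EP batch floor: BS >= num_experts / top_k = 256 / 8 = 32.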

source /opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/bin/activate

MODEL_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2-Flash-Neuron-FP8}"
# The NxDI contrib MiMo-V2-Flash modeling code is registered into vLLM /
# NxDI lookup tables by vllm-neuron's register() hook using this env var.
# Default to this contrib package's own src/ relative to the script.
: "${NXDI_CONTRIB_MIMO_V2_FLASH_SRC:=$(cd "$(dirname "$0")/.." && pwd)/src}"
export NXDI_CONTRIB_MIMO_V2_FLASH_SRC

# First-time Flash FP8 compile takes 30-60 minutes; extend vLLM's engine-ready
# timeout so the server is not killed mid-compile.
export VLLM_ENGINE_READY_TIMEOUT_S=7200

PORT=8000
RESULTS_DIR="/tmp/bench_results/mimo_v2_flash"
mkdir -p "$RESULTS_DIR"

# Common neuron config shared across all MiMo-V2-Flash FP8 configs.
# save_sharded_checkpoint=true persists per-rank sharded weights to
# <compiled-path>/weights/tp{N}_sharded_checkpoint.safetensors during compile;
# load() then reads those directly (~30s) instead of re-sharding the entire
# checkpoint on every vllm-neuron startup (~10+ min).
COMMON_MIMO_CONFIG='"tp_degree": 64,
"logical_nc_config": 2,
"fused_qkv": false,
"sequence_parallel_enabled": false,
"glu_mlp": true,
"normalize_top_k_affinities": true,
"save_sharded_checkpoint": true,
"router_config": {"act_fn": "sigmoid", "dtype": "float32"},
"quantized": true,
"quantized_checkpoints_path": "'"$MODEL_PATH"'",
"quantization_dtype": "f8e4m3",
"quantization_type": "blockwise_symmetric",
"quantization_block_axis": [1, 2],
"quantization_block_size": [128, 128],
"modules_to_not_convert": ["embed_tokens", "lm_head", "norm", "router", "o_proj"],
"blockwise_matmul_config": {"use_shard_on_block_dynamic_while": true, "block_sharding_strategy": "PING_PONG"}'

# Helper: wait for vLLM server to be ready. First-time compilation of a
# 256-expert MoE model takes 30-90 minutes, so we poll for up to 2 hours.
wait_for_server() {
    echo " Waiting for vLLM server to be ready (up to 2h for first compile)..."
    local interval=10
    local max_attempts=720 # 720 * 10s = 7200s = 2h
    local start=$SECONDS
    for i in $(seq 1 $max_attempts); do
        if curl -s http://localhost:$PORT/health > /dev/null 2>&1; then
            echo " Server ready! (waited $((SECONDS - start))s)"
            return 0
        fi
        # Show a progress blip every minute so the user knows we're alive
        if [ $((i % 6)) -eq 0 ]; then
            echo " ...still waiting ($((SECONDS - start))s elapsed)"
        fi
        sleep $interval
    done
    echo " ERROR: Server did not start within $((max_attempts * interval))s"
    return 1
}

# Helper: run benchmark
run_bench() {
    local config_name=$1
    local concurrency=$2
    local num_prompts=$3

    echo " Benchmark: concurrency=$concurrency, prompts=$num_prompts"
    vllm bench serve \
        --backend vllm \
        --model "$MODEL_PATH" \
        --tokenizer "$MODEL_PATH" \
        --endpoint /v1/completions \
        --dataset-name random \
        --num-prompts "$num_prompts" \
        --random-input-len 900 \
        --random-output-len 90 \
        --random-range-ratio 0.03 \
        --max-concurrency "$concurrency" \
        2>&1 | tee "$RESULTS_DIR/${config_name}_c${concurrency}.txt"
    echo ""
}

# Helper: stop server
stop_server() {
    echo " Stopping vLLM server..."
    pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true
    sleep 5
}

# Helper: quick sanity check
sanity_check() {
    echo " Running sanity check..."
    curl -s http://localhost:$PORT/v1/chat/completions \
        -H 'Content-Type: application/json' \
        -d '{
              "messages": [{"role": "user", "content": "What is 1+1? Answer briefly."}],
              "model": "'"$MODEL_PATH"'",
              "max_tokens": 64,
              "temperature": 0.0,
              "stream": false
            }' | python3 -c "import sys,json; r=json.load(sys.stdin); print(' Sanity:', r['choices'][0]['message']['content'][:100])" 2>/dev/null || echo " Sanity check: could not parse response"
}

echo "=========================================="
echo "MiMo-V2-Flash FP8 Performance Benchmark"
echo "=========================================="
echo "Model: $MODEL_PATH"
echo "Results: $RESULTS_DIR"
echo ""

###############################################################################
# Config 1: BS=32, TP=64 + moe_tp=1/moe_ep=64, CB + bucketing (smallest BS
# that satisfies NxDI's Expert-Parallel BS >= num_experts/top_k requirement).
###############################################################################
CONFIG_NAME="bs32_tp64_moetp1_ep64"
echo "--- Config 1: BS=32, moe_tp=1/moe_ep=64, CB + bucketing ---"

python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --tensor-parallel-size 64 \
    --max-model-len 1024 \
    --max-num-seqs 32 \
    --no-enable-chunked-prefill \
    --no-enable-prefix-caching \
    --port $PORT \
    --trust-remote-code \
    --additional-config '{
        "override_neuron_config": {
            '"$COMMON_MIMO_CONFIG"',
            "moe_tp_degree": 1,
            "moe_ep_degree": 64,
            "batch_size": 32,
            "ctx_batch_size": 1,
            "tkg_batch_size": 32,
            "max_context_length": 1024,
            "seq_len": 1024,
            "is_continuous_batching": true,
            "enable_bucketing": true,
            "context_encoding_buckets": [1024],
            "token_generation_buckets": [1024],
            "async_mode": true,
            "on_device_sampling_config": {
                "do_sample": true, "temperature": 0.6, "top_k": 20, "top_p": 0.95
            }
        }
    }' &

wait_for_server
sanity_check
run_bench "$CONFIG_NAME" 1 16
run_bench "$CONFIG_NAME" 16 128
run_bench "$CONFIG_NAME" 32 128
stop_server

###############################################################################
# Config 2: BS=128, TP=64 + moe_tp=1/moe_ep=64, CB + bucketing (throughput).
###############################################################################
CONFIG_NAME="bs128_tp64_moetp1_ep64"
echo "--- Config 2: BS=128, moe_tp=1/moe_ep=64, CB + bucketing ---"

python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --tensor-parallel-size 64 \
    --max-model-len 1024 \
    --max-num-seqs 128 \
    --no-enable-chunked-prefill \
    --no-enable-prefix-caching \
    --port $PORT \
    --trust-remote-code \
    --additional-config '{
        "override_neuron_config": {
            '"$COMMON_MIMO_CONFIG"',
            "moe_tp_degree": 1,
            "moe_ep_degree": 64,
            "batch_size": 128,
            "ctx_batch_size": 1,
            "tkg_batch_size": 128,
            "max_context_length": 1024,
            "seq_len": 1024,
            "is_continuous_batching": true,
            "enable_bucketing": true,
            "context_encoding_buckets": [1024],
            "token_generation_buckets": [1024],
            "async_mode": true,
            "on_device_sampling_config": {
                "do_sample": true, "temperature": 0.6, "top_k": 20, "top_p": 0.95
            }
        }
    }' &

wait_for_server
sanity_check
run_bench "$CONFIG_NAME" 1 16
run_bench "$CONFIG_NAME" 16 128
run_bench "$CONFIG_NAME" 32 128
run_bench "$CONFIG_NAME" 128 512
stop_server

echo "=========================================="
echo "MiMo-V2-Flash FP8 benchmarks complete!"
echo "Results saved to: $RESULTS_DIR"
echo "=========================================="
ls -la "$RESULTS_DIR"
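
The `quantized` / `blockwise_symmetric` keys in `COMMON_MIMO_CONFIG` assume the checkpoint was preprocessed into 128x128-block symmetric FP8 (e4m3) with per-block scales, as produced by `src/conversion_script/preprocess_mimo_v2_flash_fp8.py` (not shown in this diff). The following is a self-contained sketch of that quantization scheme, not the contrib script's actual code:

```python
import torch

BLOCK = 128
F8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn


def quantize_blockwise(w: torch.Tensor):
    """Quantize a 2-D weight in 128x128 blocks; returns (fp8 weight, fp32 scales)."""
    rows, cols = w.shape
    q = torch.empty_like(w, dtype=torch.float8_e4m3fn)
    n_br = (rows + BLOCK - 1) // BLOCK
    n_bc = (cols + BLOCK - 1) // BLOCK
    scales = torch.empty(n_br, n_bc, dtype=torch.float32)
    for bi in range(n_br):
        for bj in range(n_bc):
            blk = w[bi * BLOCK:(bi + 1) * BLOCK, bj * BLOCK:(bj + 1) * BLOCK]
            # Symmetric scale: the block's max |value| maps onto the FP8 max.
            s = blk.abs().amax().clamp(min=1e-12) / F8_MAX
            scales[bi, bj] = s
            q[bi * BLOCK:(bi + 1) * BLOCK, bj * BLOCK:(bj + 1) * BLOCK] = (blk / s).to(torch.float8_e4m3fn)
    return q, scales
```

Note how a 32-row per-rank shard under moe_tp=64 would fall entirely inside one partial block, so the row axis contributes a single scale; keeping each expert whole under moe_ep=64 preserves the full per-block scale grid. That is exactly the failure mode and fix described in the bench script's header comment.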
76 changes: 76 additions & 0 deletions contrib/models/MiMo-V2-Flash/perf_test/run_bench_single.sh
@@ -0,0 +1,76 @@
#!/bin/bash
# Run a single vllm-bench-serve pass against an already-running vLLM server.
#
# Unlike bench_mimo_v2_flash.sh this script does NOT launch or kill the vLLM
# server — you bring your own. That makes it convenient when the bench driver
# in bench_mimo_v2_flash.sh times out during first-time compilation: the server
# keeps running, and once it's ready you can collect numbers with this.
#
# Usage:
# bash run_bench_single.sh # defaults: c=1, 16 prompts
# CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
# CONFIG_NAME=bs32_tp1_ep64_opt CONCURRENCY=16 NUM_PROMPTS=128 bash run_bench_single.sh
#
# Environment knobs:
#   PORT                vLLM server port (default 8000)
#   MIMO_V2_FLASH_PATH  Checkpoint path; must match the --model the running
#                       server was launched with (default
#                       /opt/dlami/nvme/models/MiMo-V2-Flash-BF16)
#   CONCURRENCY         --max-concurrency (default 1)
#   NUM_PROMPTS         --num-prompts (default 16)
#   INPUT_LEN           --random-input-len (default 900)
#   OUTPUT_LEN          --random-output-len (default 90)
#   RANGE_RATIO         --random-range-ratio (default 0.03)
#   CONFIG_NAME         Used in the output filename (default bs1_tp64_ep1)
#   RESULTS_DIR         Where to dump the per-run log (default /tmp/bench_results/mimo_v2_flash)

set -e

source /opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/bin/activate

MODEL_PATH="${MIMO_V2_FLASH_PATH:-/opt/dlami/nvme/models/MiMo-V2-Flash-BF16}"
PORT="${PORT:-8000}"
CONCURRENCY="${CONCURRENCY:-1}"
NUM_PROMPTS="${NUM_PROMPTS:-16}"
INPUT_LEN="${INPUT_LEN:-900}"
OUTPUT_LEN="${OUTPUT_LEN:-90}"
RANGE_RATIO="${RANGE_RATIO:-0.03}"
CONFIG_NAME="${CONFIG_NAME:-bs1_tp64_ep1}"
RESULTS_DIR="${RESULTS_DIR:-/tmp/bench_results/mimo_v2_flash}"

mkdir -p "$RESULTS_DIR"

echo "=========================================="
echo "MiMo-V2-Flash single-run benchmark"
echo "=========================================="
echo " Model: $MODEL_PATH"
echo " Port: $PORT"
echo " Config: $CONFIG_NAME"
echo " Concurrency: $CONCURRENCY"
echo " Prompts: $NUM_PROMPTS"
echo " Input len: $INPUT_LEN Output len: $OUTPUT_LEN"
echo " Results: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"
echo ""

# Quick health check
if ! curl -sf "http://localhost:$PORT/health" > /dev/null; then
    echo "ERROR: vLLM server is not responding on http://localhost:$PORT"
    echo "Start it first (e.g., bench_mimo_v2_flash.sh) and wait until"
    echo "'Application startup complete.' is printed."
    exit 1
fi

vllm bench serve \
    --backend vllm \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --endpoint /v1/completions \
    --dataset-name random \
    --num-prompts "$NUM_PROMPTS" \
    --random-input-len "$INPUT_LEN" \
    --random-output-len "$OUTPUT_LEN" \
    --random-range-ratio "$RANGE_RATIO" \
    --max-concurrency "$CONCURRENCY" \
    2>&1 | tee "$RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"

echo ""
echo "Saved to: $RESULTS_DIR/${CONFIG_NAME}_c${CONCURRENCY}.txt"