326 changes: 244 additions & 82 deletions contrib/models/Qwen2.5-Omni-7B/README.md

Large diffs are not rendered by default.

456 changes: 456 additions & 0 deletions contrib/models/Qwen2.5-Omni-7B/examples/generate_qwen25_omni.py

Large diffs are not rendered by default.

763 changes: 763 additions & 0 deletions contrib/models/Qwen2.5-Omni-7B/examples/generate_qwen25_omni_speech.py

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions contrib/models/Qwen2.5-Omni-7B/perf_test/3_bench_qwen25_omni_7b.sh
@@ -0,0 +1,171 @@
#!/bin/bash
set -e

source /opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/bin/activate

MODEL_PATH="/opt/dlami/nvme/models/Qwen2.5-Omni-7B"
PORT=8000
RESULTS_DIR="/var/tmp/bench_results/qwen25_omni_7b"
mkdir -p "$RESULTS_DIR"

# Helper: wait for vLLM server to be ready
wait_for_server() {
    echo " Waiting for vLLM server to be ready..."
    for i in $(seq 1 360); do
        if curl -s http://localhost:$PORT/health > /dev/null 2>&1; then
            echo " Server ready! (~$((i*5))s elapsed)"
            return 0
        fi
        sleep 5
    done
    echo " ERROR: Server did not start within 1800s"
    return 1
}

# Helper: run benchmark
run_bench() {
    local config_name=$1
    local concurrency=$2
    local num_prompts=$3

    echo " Benchmark: concurrency=$concurrency, prompts=$num_prompts"
    vllm bench serve \
        --backend vllm \
        --model "$MODEL_PATH" \
        --tokenizer "$MODEL_PATH" \
        --endpoint /v1/completions \
        --dataset-name random \
        --num-prompts "$num_prompts" \
        --random-input-len 900 \
        --random-output-len 90 \
        --random-range-ratio 0.03 \
        --max-concurrency "$concurrency" \
        2>&1 | tee "$RESULTS_DIR/${config_name}_c${concurrency}.txt"
    echo ""
}

# Helper: stop server
stop_server() {
    echo " Stopping vLLM server..."
    pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true
    sleep 5
}

# Helper: quick sanity check
sanity_check() {
    echo " Running sanity check..."
    curl -s http://localhost:$PORT/v1/chat/completions \
        -H 'Content-Type: application/json' \
        -d '{
            "messages": [{"role": "user", "content": "What is 1+1? Answer briefly."}],
            "model": "'"$MODEL_PATH"'",
            "max_tokens": 64,
            "temperature": 0.0,
            "stream": false
        }' | python3 -c "import sys,json; r=json.load(sys.stdin); print(' Sanity:', r['choices'][0]['message']['content'][:100])" 2>/dev/null || echo " Sanity check: could not parse response"
}

echo "=========================================="
echo "Qwen2.5-Omni-7B Performance Benchmark"
echo "=========================================="
echo "Model: $MODEL_PATH"
echo "Results: $RESULTS_DIR"
echo ""

###############################################################################
# Config 1: BS=1, TP=4, non-CB (baseline latency)
# Qwen2.5-Omni-7B is a dense 7B model, TP=4 is sufficient
###############################################################################
CONFIG_NAME="bs1_tp4"
echo "--- Config 1: BS=1, TP=4, non-CB (baseline) ---"

python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --tensor-parallel-size 4 \
    --max-model-len 4096 \
    --max-num-seqs 1 \
    --no-enable-chunked-prefill \
    --no-enable-prefix-caching \
    --port $PORT \
    --trust-remote-code \
    --additional-config '{
      "override_neuron_config": {
        "tp_degree": 4,
        "fused_qkv": false,
        "flash_decoding_enabled": false,
        "sequence_parallel_enabled": false,
        "qkv_kernel_enabled": false,
        "qkv_nki_kernel_enabled": false,
        "attn_kernel_enabled": false,
        "batch_size": 1,
        "ctx_batch_size": 1,
        "tkg_batch_size": 1,
        "max_context_length": 4096,
        "seq_len": 4096,
        "is_continuous_batching": false,
        "enable_bucketing": false,
        "async_mode": true,
        "on_device_sampling_config": {
          "do_sample": true, "temperature": 0.6, "top_k": 20, "top_p": 0.95
        }
      }
    }' &

wait_for_server
sanity_check
run_bench "$CONFIG_NAME" 1 16
stop_server

###############################################################################
# Config 2: BS=4, TP=4, CB (throughput)
###############################################################################
CONFIG_NAME="bs4_tp4_cb"
echo "--- Config 2: BS=4, TP=4, CB ---"

python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --tensor-parallel-size 4 \
    --max-model-len 4096 \
    --max-num-seqs 4 \
    --no-enable-chunked-prefill \
    --no-enable-prefix-caching \
    --port $PORT \
    --trust-remote-code \
    --additional-config '{
      "override_neuron_config": {
        "tp_degree": 4,
        "fused_qkv": false,
        "flash_decoding_enabled": false,
        "sequence_parallel_enabled": false,
        "qkv_kernel_enabled": false,
        "qkv_nki_kernel_enabled": false,
        "attn_kernel_enabled": false,
        "batch_size": 4,
        "ctx_batch_size": 1,
        "tkg_batch_size": 4,
        "max_context_length": 4096,
        "seq_len": 4096,
        "is_continuous_batching": true,
        "enable_bucketing": true,
        "context_encoding_buckets": [4096],
        "token_generation_buckets": [4096],
        "async_mode": true,
        "on_device_sampling_config": {
          "do_sample": true, "temperature": 0.6, "top_k": 20, "top_p": 0.95
        }
      }
    }' &

wait_for_server
sanity_check
run_bench "$CONFIG_NAME" 1 16
run_bench "$CONFIG_NAME" 4 64
stop_server

echo "=========================================="
echo "Qwen2.5-Omni-7B benchmarks complete!"
echo "Results saved to: $RESULTS_DIR"
echo "=========================================="
ls -la "$RESULTS_DIR"
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""Add Qwen2.5-Omni model support to vllm-neuron.

This patch should be applied AFTER the MiMo/MiniMax patch (apply_vllm_neuron_patch.py).
It handles:
1. Config extraction: Qwen2.5-Omni nests text config under thinker_config.text_config
2. Architecture mapping: "Qwen2_5OmniModel" -> "qwen2_5_omni" model type
3. Layer count extraction: get_num_layers_from_hf_config for nested config
"""

# Patch 1 & 2: neuronx_distributed_model_loader.py
LOADER_FILE = "/opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/lib/python3.12/site-packages/vllm_neuron/worker/neuronx_distributed_model_loader.py"

with open(LOADER_FILE) as f:
    content = f.read()

# 1. In _get_model_configs: handle Qwen2.5-Omni nested config
content = content.replace(
    ' if architecture in NEURON_MULTI_MODAL_MODELS:\n'
    ' config = getattr(config, "text_config", None)\n'
    ' num_key_value_heads = getattr(config, "num_key_value_heads", None)',
    ' if architecture in NEURON_MULTI_MODAL_MODELS:\n'
    ' config = getattr(config, "text_config", None)\n'
    ' # Qwen2.5-Omni: text config is nested under thinker_config.text_config\n'
    ' if architecture == "Qwen2_5OmniModel":\n'
    ' thinker_config = getattr(config, "thinker_config", None)\n'
    ' if thinker_config is not None:\n'
    ' config = getattr(thinker_config, "text_config", config)\n'
    ' num_key_value_heads = getattr(config, "num_key_value_heads", None)',
)

# 2. In _get_neuron_model_cls: handle Qwen2_5OmniModel architecture
content = content.replace(
    ' try:\n'
    ' if "For" in architecture:',
    ' # Qwen2.5-Omni: architecture is "Qwen2_5OmniModel" (no "For" in name)\n'
    ' if architecture == "Qwen2_5OmniModel":\n'
    ' return MODEL_TYPES["qwen2_5_omni"]["causal-lm"]\n'
    '\n'
    ' try:\n'
    ' if "For" in architecture:',
)

with open(LOADER_FILE, "w") as f:
f.write(content)

print("Patch 1/2: neuronx_distributed_model_loader.py updated")

# Patch 3: utils.py - handle Qwen2.5-Omni nested config for layer count
UTILS_FILE = "/opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/lib/python3.12/site-packages/vllm_neuron/worker/utils.py"

with open(UTILS_FILE) as f:
    content = f.read()

content = content.replace(
    ' # Sum nested configs (multimodal models)\n'
    ' total = 0\n'
    ' for attr in ["text_config", "vision_config"]:',
    ' # Qwen2.5-Omni: check thinker_config.text_config\n'
    ' thinker_config = getattr(hf_config, "thinker_config", None)\n'
    ' if thinker_config is not None:\n'
    ' text_config = getattr(thinker_config, "text_config", None)\n'
    ' if text_config is not None:\n'
    ' layers = getattr(text_config, "num_hidden_layers", None)\n'
    ' if layers is not None:\n'
    ' return layers\n'
    '\n'
    ' # Sum nested configs (multimodal models)\n'
    ' total = 0\n'
    ' for attr in ["text_config", "vision_config"]:',
)

with open(UTILS_FILE, "w") as f:
f.write(content)

print("Patch 2/2: utils.py updated")
print()
print("Qwen2.5-Omni vllm-neuron patch applied successfully!")
print(" 1. Added thinker_config.text_config extraction in _get_model_configs")
print(" 2. Added Qwen2_5OmniModel -> qwen2_5_omni mapping in _get_neuron_model_cls")
print(" 3. Added thinker_config.text_config layer count extraction in utils.py")
11 changes: 9 additions & 2 deletions contrib/models/Qwen2.5-Omni-7B/src/__init__.py
@@ -1,3 +1,10 @@
from .modeling_qwen2_5_omni import NeuronQwen2_5OmniForCausalLM, Qwen2_5OmniInferenceConfig
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Importing this package applies an upstream bug fix for
# HuggingFaceGenerationAdapter.prepare_inputs_for_generation so that
# adapter.generate() does not raise NameError when forwarding
# tensor_capture_hook downstream. The fix is idempotent and only activates
# if the upstream file still contains the bug.
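#
# A minimal sketch of that guard pattern (illustrative only; the actual
# logic lives in ``_upstream_compat`` and is not shown in this diff):
#
#     import inspect
#     fn = HuggingFaceGenerationAdapter.prepare_inputs_for_generation
#     if "tensor_capture_hook" in inspect.getsource(fn):  # bug still present
#         HuggingFaceGenerationAdapter.prepare_inputs_for_generation = fixed_fn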

__all__ = ["NeuronQwen2_5OmniForCausalLM", "Qwen2_5OmniInferenceConfig"]
from . import _upstream_compat # noqa: F401 (side-effect import)
22 changes: 22 additions & 0 deletions contrib/models/Qwen2.5-Omni-7B/src/_model_path.py
@@ -0,0 +1,22 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Helper for resolving the Qwen2.5-Omni-7B weight path.
#
# Honors ``$QWEN25_OMNI_MODEL_PATH`` if it points at a directory containing a
# ``config.json``. Otherwise it delegates to ``huggingface_hub.snapshot_download``,
# which is a no-op when the model is already cached and, in either case, returns
# the real snapshot directory (including the commit hash).

import os


HF_REPO_ID = "Qwen/Qwen2.5-Omni-7B"


def resolve_model_path() -> str:
    env = os.environ.get("QWEN25_OMNI_MODEL_PATH")
    if env and os.path.isfile(os.path.join(env, "config.json")):
        return env
    from huggingface_hub import snapshot_download
    return snapshot_download(HF_REPO_ID)
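
# Usage sketch (the import path is an assumption; it depends on how this
# package is installed):
#
#     from src._model_path import resolve_model_path
#     model_dir = resolve_model_path()  # env override, else cached HF snapshot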