326 changes: 244 additions & 82 deletions contrib/models/Qwen2.5-Omni-7B/README.md

Large diffs are not rendered by default.

456 changes: 456 additions & 0 deletions contrib/models/Qwen2.5-Omni-7B/examples/generate_qwen25_omni.py

Large diffs are not rendered by default.

763 changes: 763 additions & 0 deletions contrib/models/Qwen2.5-Omni-7B/examples/generate_qwen25_omni_speech.py

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions contrib/models/Qwen2.5-Omni-7B/perf_test/3_bench_qwen25_omni_7b.sh
@@ -0,0 +1,171 @@
#!/bin/bash
set -e

source /opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/bin/activate

MODEL_PATH="/opt/dlami/nvme/models/Qwen2.5-Omni-7B"
PORT=8000
RESULTS_DIR="/var/tmp/bench_results/qwen25_omni_7b"
mkdir -p "$RESULTS_DIR"

# Helper: wait for vLLM server to be ready
wait_for_server() {
    echo " Waiting for vLLM server to be ready..."
    for i in $(seq 1 360); do
        if curl -s http://localhost:$PORT/health > /dev/null 2>&1; then
            echo " Server ready! (~$((i*5))s elapsed)"
            return 0
        fi
        sleep 5
    done
    echo " ERROR: Server did not start within 1800s"
    return 1
}

# Helper: run benchmark
run_bench() {
    local config_name=$1
    local concurrency=$2
    local num_prompts=$3

    echo " Benchmark: concurrency=$concurrency, prompts=$num_prompts"
    vllm bench serve \
        --backend vllm \
        --model "$MODEL_PATH" \
        --tokenizer "$MODEL_PATH" \
        --endpoint /v1/completions \
        --dataset-name random \
        --num-prompts "$num_prompts" \
        --random-input-len 900 \
        --random-output-len 90 \
        --random-range-ratio 0.03 \
        --max-concurrency "$concurrency" \
        2>&1 | tee "$RESULTS_DIR/${config_name}_c${concurrency}.txt"
    echo ""
}

# Helper: stop server
stop_server() {
    echo " Stopping vLLM server..."
    pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true
    sleep 5
}

# Helper: quick sanity check
sanity_check() {
    echo " Running sanity check..."
    curl -s http://localhost:$PORT/v1/chat/completions \
        -H 'Content-Type: application/json' \
        -d '{
            "messages": [{"role": "user", "content": "What is 1+1? Answer briefly."}],
            "model": "'"$MODEL_PATH"'",
            "max_tokens": 64,
            "temperature": 0.0,
            "stream": false
        }' | python3 -c "import sys,json; r=json.load(sys.stdin); print(' Sanity:', r['choices'][0]['message']['content'][:100])" 2>/dev/null || echo " Sanity check: could not parse response"
}

echo "=========================================="
echo "Qwen2.5-Omni-7B Performance Benchmark"
echo "=========================================="
echo "Model: $MODEL_PATH"
echo "Results: $RESULTS_DIR"
echo ""

###############################################################################
# Config 1: BS=1, TP=4, non-CB (baseline latency)
# Qwen2.5-Omni-7B is a dense 7B model, TP=4 is sufficient
###############################################################################
CONFIG_NAME="bs1_tp4"
echo "--- Config 1: BS=1, TP=4, non-CB (baseline) ---"

python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --tensor-parallel-size 4 \
    --max-model-len 4096 \
    --max-num-seqs 1 \
    --no-enable-chunked-prefill \
    --no-enable-prefix-caching \
    --port $PORT \
    --trust-remote-code \
    --additional-config '{
      "override_neuron_config": {
        "tp_degree": 4,
        "fused_qkv": false,
        "flash_decoding_enabled": false,
        "sequence_parallel_enabled": false,
        "qkv_kernel_enabled": false,
        "qkv_nki_kernel_enabled": false,
        "attn_kernel_enabled": false,
        "batch_size": 1,
        "ctx_batch_size": 1,
        "tkg_batch_size": 1,
        "max_context_length": 4096,
        "seq_len": 4096,
        "is_continuous_batching": false,
        "enable_bucketing": false,
        "async_mode": true,
        "on_device_sampling_config": {
          "do_sample": true, "temperature": 0.6, "top_k": 20, "top_p": 0.95
        }
      }
    }' &

wait_for_server
sanity_check
run_bench "$CONFIG_NAME" 1 16
stop_server

###############################################################################
# Config 2: BS=4, TP=4, CB (throughput)
###############################################################################
CONFIG_NAME="bs4_tp4_cb"
echo "--- Config 2: BS=4, TP=4, CB ---"

python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --tokenizer "$MODEL_PATH" \
    --tensor-parallel-size 4 \
    --max-model-len 4096 \
    --max-num-seqs 4 \
    --no-enable-chunked-prefill \
    --no-enable-prefix-caching \
    --port $PORT \
    --trust-remote-code \
    --additional-config '{
      "override_neuron_config": {
        "tp_degree": 4,
        "fused_qkv": false,
        "flash_decoding_enabled": false,
        "sequence_parallel_enabled": false,
        "qkv_kernel_enabled": false,
        "qkv_nki_kernel_enabled": false,
        "attn_kernel_enabled": false,
        "batch_size": 4,
        "ctx_batch_size": 1,
        "tkg_batch_size": 4,
        "max_context_length": 4096,
        "seq_len": 4096,
        "is_continuous_batching": true,
        "enable_bucketing": true,
        "context_encoding_buckets": [4096],
        "token_generation_buckets": [4096],
        "async_mode": true,
        "on_device_sampling_config": {
          "do_sample": true, "temperature": 0.6, "top_k": 20, "top_p": 0.95
        }
      }
    }' &

wait_for_server
sanity_check
run_bench "$CONFIG_NAME" 1 16
run_bench "$CONFIG_NAME" 4 64
stop_server

echo "=========================================="
echo "Qwen2.5-Omni-7B benchmarks complete!"
echo "Results saved to: $RESULTS_DIR"
echo "=========================================="
ls -la "$RESULTS_DIR"
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""Add Qwen2.5-Omni model support to vllm-neuron.

This patch should be applied AFTER the MiMo/MiniMax patch (apply_vllm_neuron_patch.py).
It handles:
1. Config extraction: Qwen2.5-Omni nests text config under thinker_config.text_config
2. Architecture mapping: "Qwen2_5OmniModel" -> "qwen2_5_omni" model type
3. Layer count extraction: get_num_layers_from_hf_config for nested config
"""

# Patch 1 & 2: neuronx_distributed_model_loader.py
LOADER_FILE = "/opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/lib/python3.12/site-packages/vllm_neuron/worker/neuronx_distributed_model_loader.py"

with open(LOADER_FILE) as f:
    content = f.read()

# 1. In _get_model_configs: handle Qwen2.5-Omni nested config
content = content.replace(
    ' if architecture in NEURON_MULTI_MODAL_MODELS:\n'
    ' config = getattr(config, "text_config", None)\n'
    ' num_key_value_heads = getattr(config, "num_key_value_heads", None)',
    ' if architecture in NEURON_MULTI_MODAL_MODELS:\n'
    ' config = getattr(config, "text_config", None)\n'
    ' # Qwen2.5-Omni: text config is nested under thinker_config.text_config\n'
    ' if architecture == "Qwen2_5OmniModel":\n'
    ' thinker_config = getattr(config, "thinker_config", None)\n'
    ' if thinker_config is not None:\n'
    ' config = getattr(thinker_config, "text_config", config)\n'
    ' num_key_value_heads = getattr(config, "num_key_value_heads", None)',
)

# 2. In _get_neuron_model_cls: handle Qwen2_5OmniModel architecture
content = content.replace(
    ' try:\n'
    ' if "For" in architecture:',
    ' # Qwen2.5-Omni: architecture is "Qwen2_5OmniModel" (no "For" in name)\n'
    ' if architecture == "Qwen2_5OmniModel":\n'
    ' return MODEL_TYPES["qwen2_5_omni"]["causal-lm"]\n'
    '\n'
    ' try:\n'
    ' if "For" in architecture:',
)

with open(LOADER_FILE, "w") as f:
f.write(content)

print("Patch 1/2: neuronx_distributed_model_loader.py updated")

# Patch 3: utils.py - handle Qwen2.5-Omni nested config for layer count
UTILS_FILE = "/opt/aws_neuronx_venv_pytorch_2_9_nxd_inference/lib/python3.12/site-packages/vllm_neuron/worker/utils.py"

with open(UTILS_FILE) as f:
    content = f.read()

content = content.replace(
    ' # Sum nested configs (multimodal models)\n'
    ' total = 0\n'
    ' for attr in ["text_config", "vision_config"]:',
    ' # Qwen2.5-Omni: check thinker_config.text_config\n'
    ' thinker_config = getattr(hf_config, "thinker_config", None)\n'
    ' if thinker_config is not None:\n'
    ' text_config = getattr(thinker_config, "text_config", None)\n'
    ' if text_config is not None:\n'
    ' layers = getattr(text_config, "num_hidden_layers", None)\n'
    ' if layers is not None:\n'
    ' return layers\n'
    '\n'
    ' # Sum nested configs (multimodal models)\n'
    ' total = 0\n'
    ' for attr in ["text_config", "vision_config"]:',
)

with open(UTILS_FILE, "w") as f:
f.write(content)

print("Patch 2/2: utils.py updated")
print()
print("Qwen2.5-Omni vllm-neuron patch applied successfully!")
print(" 1. Added thinker_config.text_config extraction in _get_model_configs")
print(" 2. Added Qwen2_5OmniModel -> qwen2_5_omni mapping in _get_neuron_model_cls")
print(" 3. Added thinker_config.text_config layer count extraction in utils.py")
11 changes: 9 additions & 2 deletions contrib/models/Qwen2.5-Omni-7B/src/__init__.py
@@ -1,3 +1,10 @@
from .modeling_qwen2_5_omni import NeuronQwen2_5OmniForCausalLM, Qwen2_5OmniInferenceConfig
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Importing this package applies an upstream bug fix for
# HuggingFaceGenerationAdapter.prepare_inputs_for_generation so that
# adapter.generate() does not raise NameError when forwarding
# tensor_capture_hook downstream. The fix is idempotent and only activates
# if the upstream file still contains the bug.
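#
# A minimal sketch of that guard pattern (illustrative only; the actual
# logic lives in ``_upstream_compat`` and is not shown in this diff):
#
#     import inspect
#     fn = HuggingFaceGenerationAdapter.prepare_inputs_for_generation
#     if "tensor_capture_hook" in inspect.getsource(fn):  # bug still present
#         HuggingFaceGenerationAdapter.prepare_inputs_for_generation = fixed_fn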

__all__ = ["NeuronQwen2_5OmniForCausalLM", "Qwen2_5OmniInferenceConfig"]
from . import _upstream_compat # noqa: F401 (side-effect import)
22 changes: 22 additions & 0 deletions contrib/models/Qwen2.5-Omni-7B/src/_model_path.py
@@ -0,0 +1,22 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Helper for resolving the Qwen2.5-Omni-7B weight path.
#
# Honors ``$QWEN25_OMNI_MODEL_PATH`` if it points at a directory containing a
# ``config.json``. Otherwise it delegates to ``huggingface_hub.snapshot_download``,
# which is a no-op when the model is already cached and, in either case, returns
# the real snapshot directory (including the commit hash).

import os


HF_REPO_ID = "Qwen/Qwen2.5-Omni-7B"


def resolve_model_path() -> str:
    env = os.environ.get("QWEN25_OMNI_MODEL_PATH")
    if env and os.path.isfile(os.path.join(env, "config.json")):
        return env
    from huggingface_hub import snapshot_download
    return snapshot_download(HF_REPO_ID)
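
# Usage sketch (the import path is an assumption; it depends on how this
# package is installed):
#
#     from src._model_path import resolve_model_path
#     model_dir = resolve_model_path()  # env override, else cached HF snapshot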