diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md
index 263ebd1..cd9ced8 100644
--- a/benchmark/vllm/README.md
+++ b/benchmark/vllm/README.md
@@ -101,6 +101,7 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki
 | MAD model name | Model repo |
 | -------------------------------------- | -------------------------------------- |
 | pyt_vllm_deepseek-r1 | [deepseek-ai/DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) |
+| pyt_vllm_deepseek-moe-16b-chat | [deepseek-ai/deepseek-moe-16b-chat](https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat) |
 | pyt_vllm_gpt-oss-20b | [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b) |
 | pyt_vllm_gpt-oss-120b | [openai/gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b) |
 | pyt_vllm_llama-2-70b | [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) |
diff --git a/models.json b/models.json
index 7dfebf3..a1db1fb 100644
--- a/models.json
+++ b/models.json
@@ -48,6 +48,26 @@
         "args":
             "--model_repo deepseek-ai/DeepSeek-R1-0528 --config configs/default.csv"
     },
+    {
+        "name": "pyt_vllm_deepseek-moe-16b-chat",
+        "url": "",
+        "dockerfile": "docker/pyt_vllm",
+        "scripts": "scripts/vllm/run.sh",
+        "data": "huggingface",
+        "n_gpus": "-1",
+        "owner": "mad.support@amd.com",
+        "training_precision": "",
+        "multiple_results": "perf_deepseek-moe-16b-chat.csv",
+        "tags": [
+            "pyt",
+            "vllm",
+            "vllm_default",
+            "inference"
+        ],
+        "timeout": -1,
+        "args":
+            "--model_repo deepseek-ai/deepseek-moe-16b-chat --config configs/default.csv"
+    },
     {
         "name": "pyt_vllm_gpt-oss-20b",
         "url": "",
diff --git a/scripts/vllm/configs/default.csv b/scripts/vllm/configs/default.csv
index 468738f..84a4fee 100644
--- a/scripts/vllm/configs/default.csv
+++ b/scripts/vllm/configs/default.csv
@@ -31,3 +31,6 @@ mistralai/Mixtral-8x22B-Instruct-v0.1,serving,8,128 2048,128 2048,,,1 8 32 128,f
 
 amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV,throughput,8,128 2048,128 2048,,1024,,float8,1024,65536,8192,0.9
 amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV,serving,8,128 2048,128 2048,,,1 8 32 128,float8,1024,65536,8192,0.9
+
+deepseek-ai/deepseek-moe-16b-chat,throughput,8,128 2048,128 2048,,1024,,bfloat16,1024,65536,4096,0.9
+deepseek-ai/deepseek-moe-16b-chat,serving,8,128 2048,128 2048,,,1 8 32 128,bfloat16,1024,65536,4096,0.9
diff --git a/scripts/vllm/run.sh b/scripts/vllm/run.sh
index 81c4c1a..d280deb 100755
--- a/scripts/vllm/run.sh
+++ b/scripts/vllm/run.sh
@@ -85,6 +85,12 @@ if [[ $MODEL == *"Llama-4"* ]]; then
     VLLM_ARGS='--compilation-config {"cudagraph_mode":"PIECEWISE","pass_config":{"enable_attn_fusion":false}}'
 fi
 
+# DeepSeek MoE models are not compatible with AITER (weight shuffling issue)
+if [[ $MODEL == *"deepseek-moe"* ]]; then
+    echo "Disabling AITER for DeepSeek MoE model (not compatible with current AITER implementation)"
+    export VLLM_ROCM_USE_AITER=0
+fi
+
 # MXFP4 models are only supported on MI35x i.e. gfx950
 if [[ $MODEL == *"MXFP4"* ]]; then
    if [[ $MAD_SYSTEM_GPU_ARCHITECTURE != *"gfx950"* ]]; then
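
Reviewer note: the two new `default.csv` rows appear to follow the column layout of the neighboring Mixtral entries (model repo, benchmark mode, tensor-parallel size, input/output lengths, prompt count or concurrency list, dtype, max-num-seqs, max-num-batched-tokens, max-model-len, GPU memory utilization). That mapping is inferred from the surrounding rows, not documented in this diff. As a rough sketch under that assumption, the throughput row corresponds to an invocation along these lines; the script path is hypothetical, while the flags are standard options of vLLM's `benchmark_throughput.py`:

```bash
# Sketch only: approximates the new throughput row outside the MAD harness.
# BENCH_SCRIPT is a hypothetical path; locate benchmark_throughput.py in your
# vLLM checkout or container image before running.
export VLLM_ROCM_USE_AITER=0   # mirrors the new deepseek-moe guard in run.sh
BENCH_SCRIPT=benchmarks/benchmark_throughput.py
python "$BENCH_SCRIPT" \
    --model deepseek-ai/deepseek-moe-16b-chat \
    --tensor-parallel-size 8 \
    --input-len 128 \
    --output-len 128 \
    --num-prompts 1024 \
    --dtype bfloat16 \
    --max-model-len 4096 \
    --gpu-memory-utilization 0.9
```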
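
Since the `models.json` entry is hand-edited, a quick parse check catches stray commas before CI does. This assumes the file's top level is plain JSON, which the hunk context suggests but does not confirm:

```bash
# Validate models.json after editing; exits non-zero on any syntax error.
python -m json.tool models.json > /dev/null && echo "models.json: valid JSON"
```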