From 11a61a9e4d559b425f4da977dc78bc1e56ff2f94 Mon Sep 17 00:00:00 2001 From: Prashanth Palaniappan Date: Mon, 22 Dec 2025 16:48:33 -0600 Subject: [PATCH 1/4] Enable vLLM Deepseek MoE 16B Chat --- benchmark/vllm/README.md | 1 + models.json | 20 ++++++++++++++++++++ scripts/vllm/configs/default.csv | 3 +++ scripts/vllm/run.sh | 6 ++++++ 4 files changed, 30 insertions(+) diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md index 263ebd1..cd9ced8 100644 --- a/benchmark/vllm/README.md +++ b/benchmark/vllm/README.md @@ -101,6 +101,7 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki | MAD model name | Model repo | | -------------------------------------- | -------------------------------------- | | pyt_vllm_deepseek-r1 | [deepseek-ai/DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) | +| pyt_vllm_deepseek-moe-16b-chat | [deepseek-ai/deepseek-moe-16b-chat](https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat) | | pyt_vllm_gpt-oss-20b | [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b) | | pyt_vllm_gpt-oss-120b | [openai/gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b) | | pyt_vllm_llama-2-70b | [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | diff --git a/models.json b/models.json index 7dfebf3..2880058 100644 --- a/models.json +++ b/models.json @@ -48,6 +48,26 @@ "args": "--model_repo deepseek-ai/DeepSeek-R1-0528 --config configs/default.csv" }, + { + "name": "pyt_vllm_deepseek-moe-16b-chat", + "url": "", + "dockerfile": "docker/pyt_vllm", + "scripts": "scripts/vllm/run.sh", + "data": "huggingface", + "n_gpus": "-1", + "owner": "mad.support@amd.com", + "training_precision": "", + "multiple_results": "perf_DeepSeek-MoE-16B-Chat.csv", + "tags": [ + "pyt", + "vllm", + "vllm_default", + "inference" + ], + "timeout": -1, + "args": + "--model_repo deepseek-ai/deepseek-moe-16b-chat --config configs/default.csv" + }, { "name": "pyt_vllm_gpt-oss-20b", "url": "", diff --git a/scripts/vllm/configs/default.csv b/scripts/vllm/configs/default.csv index 468738f..af2940b 100644 --- a/scripts/vllm/configs/default.csv +++ b/scripts/vllm/configs/default.csv @@ -31,3 +31,6 @@ mistralai/Mixtral-8x22B-Instruct-v0.1,serving,8,128 2048,128 2048,,,1 8 32 128,f amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV,throughput,8,128 2048,128 2048,,1024,,float8,1024,65536,8192,0.9 amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV,serving,8,128 2048,128 2048,,,1 8 32 128,float8,1024,65536,8192,0.9 + +deepseek-ai/deepseek-moe-16b-chat,throughput,8,128 2048,128 2048,,1024,,bfloat16,1024,65536,4096,0.9 +deepseek-ai/deepseek-moe-16b-chat,serving,8,128 2048,128 2048,,,1 8 32 128,bfloat16,1024,131072,4096,0.7 diff --git a/scripts/vllm/run.sh b/scripts/vllm/run.sh index 81c4c1a..d280deb 100755 --- a/scripts/vllm/run.sh +++ b/scripts/vllm/run.sh @@ -85,6 +85,12 @@ if [[ $MODEL == *"Llama-4"* ]]; then VLLM_ARGS='--compilation-config {"cudagraph_mode":"PIECEWISE","pass_config":{"enable_attn_fusion":false}}' fi +# DeepSeek MoE models are not compatible with AITER (weight shuffling issue) +if [[ $MODEL == *"deepseek-moe"* ]]; then + echo "Disabling AITER for DeepSeek MoE model (not compatible with current AITER implementation)" + export VLLM_ROCM_USE_AITER=0 +fi + # MXFP4 models are only supported on MI35x i.e. gfx950 if [[ $MODEL == *"MXFP4"* ]]; then if [[ $MAD_SYSTEM_GPU_ARCHITECTURE != *"gfx950"* ]]; then From 6fbe9c98eb62455459133d8e8ab1c39db142cb1f Mon Sep 17 00:00:00 2001 From: Prashanth Palaniappan Date: Mon, 22 Dec 2025 16:51:34 -0600 Subject: [PATCH 2/4] correct combined csv file name --- models.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models.json b/models.json index 2880058..a1db1fb 100644 --- a/models.json +++ b/models.json @@ -57,7 +57,7 @@ "n_gpus": "-1", "owner": "mad.support@amd.com", "training_precision": "", - "multiple_results": "perf_DeepSeek-MoE-16B-Chat.csv", + "multiple_results": "perf_deepseek-moe-16b-chat.csv", "tags": [ "pyt", "vllm", From deccadd15d185bd184adf52d0474be34e94a711e Mon Sep 17 00:00:00 2001 From: Prashanth Palaniappan Date: Mon, 22 Dec 2025 16:54:09 -0600 Subject: [PATCH 3/4] correct gpu utilization --- scripts/vllm/configs/default.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/vllm/configs/default.csv b/scripts/vllm/configs/default.csv index af2940b..ac14acc 100644 --- a/scripts/vllm/configs/default.csv +++ b/scripts/vllm/configs/default.csv @@ -33,4 +33,4 @@ amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV,throughput,8,128 2048,128 2048,,1024,,flo amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV,serving,8,128 2048,128 2048,,,1 8 32 128,float8,1024,65536,8192,0.9 deepseek-ai/deepseek-moe-16b-chat,throughput,8,128 2048,128 2048,,1024,,bfloat16,1024,65536,4096,0.9 -deepseek-ai/deepseek-moe-16b-chat,serving,8,128 2048,128 2048,,,1 8 32 128,bfloat16,1024,131072,4096,0.7 +deepseek-ai/deepseek-moe-16b-chat,serving,8,128 2048,128 2048,,,1 8 32 128,bfloat16,1024,131072,4096,0.9 From 2ec871021eb04c6f79373537f86777fddc4c6d73 Mon Sep 17 00:00:00 2001 From: Prashanth Palaniappan Date: Mon, 22 Dec 2025 16:55:18 -0600 Subject: [PATCH 4/4] correct max num batched tokens --- scripts/vllm/configs/default.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/vllm/configs/default.csv b/scripts/vllm/configs/default.csv index ac14acc..84a4fee 100644 --- a/scripts/vllm/configs/default.csv +++ b/scripts/vllm/configs/default.csv @@ -33,4 +33,4 @@ amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV,throughput,8,128 2048,128 2048,,1024,,flo amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV,serving,8,128 2048,128 2048,,,1 8 32 128,float8,1024,65536,8192,0.9 deepseek-ai/deepseek-moe-16b-chat,throughput,8,128 2048,128 2048,,1024,,bfloat16,1024,65536,4096,0.9 -deepseek-ai/deepseek-moe-16b-chat,serving,8,128 2048,128 2048,,,1 8 32 128,bfloat16,1024,131072,4096,0.9 +deepseek-ai/deepseek-moe-16b-chat,serving,8,128 2048,128 2048,,,1 8 32 128,bfloat16,1024,65536,4096,0.9