Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ add_subdirectory(minicpm_o)
add_subdirectory(minicpm4)
add_subdirectory(qwen3)
add_subdirectory(qwen3_service)
add_subdirectory(qwen3_moe)
add_subdirectory(deepseek_ocr)

if(MLLM_BUILD_QNN_BACKEND)
Expand Down
3 changes: 3 additions & 0 deletions examples/qwen3_moe/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Command-line runner for the Qwen3 MoE example, built from main.cpp.
add_executable(mllm-qwen3-moe-runner main.cpp)
# Link the runner against the MllmRT and MllmCPUBackend targets.
target_link_libraries(mllm-qwen3-moe-runner PRIVATE MllmRT MllmCPUBackend)
# Add ${MLLM_INCLUDE_DIR} to the runner's private include path.
target_include_directories(mllm-qwen3-moe-runner PRIVATE ${MLLM_INCLUDE_DIR})
37 changes: 37 additions & 0 deletions examples/qwen3_moe/config_30B_A3B_gguf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"architectures": [
"Qwen3MoeForCausalLM"
],
"attention_bias": false,
"bos_token_id": 151643,
"decoder_sparse_step": 1,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 6144,
"max_position_embeddings": 262144,
"max_window_layers": 48,
"mlp_only_layers": [],
"model_type": "qwen3_moe",
"moe_intermediate_size": 768,
"norm_topk_prob": true,
"num_attention_heads": 32,
"num_experts": 128,
"num_experts_per_tok": 8,
"num_hidden_layers": 48,
"num_key_value_heads": 4,
"output_router_logits": false,
"rms_norm_eps": 1e-06,
"rope_scaling": 1.0,
"rope_theta": 10000000,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

rope_theta value here (10000000) differs from the default in configuration_qwen3_moe.hpp (1000000.0).

The JSON provides 10000000 (10⁷) while the C++ default at configuration_qwen3_moe.hpp:60 is 1000000.0 (10⁶). When loading from JSON this is fine, but anyone using the default constructor gets the wrong value. Please align the C++ default with the model's actual rope_theta.

🤖 Prompt for AI Agents
In `@examples/qwen3_moe/config_30B_A3B_gguf.json` at line 28, The default
rope_theta in configuration_qwen3_moe.hpp is mismatched with the model JSON;
update the default value used by the class (the member/initializer named
rope_theta in the configuration_qwen3_moe.hpp default constructor or in-class
initializer) from 1e6 (1000000.0) to 1e7 (10000000.0) so it matches the
examples/qwen3_moe/config_30B_A3B_gguf.json value.

"router_aux_loss_coef": 0.001,
"tie_word_embeddings": true,
"transformers_version": "4.51.0",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936,
"max_cache_length": 16384,
"linear_impl_type": "Default"
}
80 changes: 80 additions & 0 deletions examples/qwen3_moe/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include <iostream>
#include <fmt/core.h>
#include <mllm/mllm.hpp>
#include <mllm/models/qwen3_moe/modeling_qwen3_moe_fa2.hpp>
#include <mllm/models/qwen3_moe/tokenization_qwen3_moe.hpp>
#include <mllm/utils/AnyValue.hpp>

using mllm::Argparse;

MLLM_MAIN({
// Register the command-line options. Everything except -h/--help is marked
// required, so the runner cannot start without all four of them.
auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model path").required(true);
auto& model_version = Argparse::add<std::string>("-mv|--model_version").help("Model version").required(true);
auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer_path").help("Tokenizer directory").required(true);
auto& config_path = Argparse::add<std::string>("-c|--config_path").help("Config path").required(true);

// NOTE(review): parse() reportedly exits when a required argument is missing,
// so `--help` on its own may never reach the help check further down —
// confirm against Argparse.hpp.
Argparse::parse(argc, argv);

// Perf tracing is compiled in only when MLLM_PERFETTO_ENABLE is defined.
#ifdef MLLM_PERFETTO_ENABLE
mllm::perf::start();
#endif

mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1;
if (model_version.get() == "v1") {
file_version = mllm::ModelFileVersion::kV1;
} else if (model_version.get() == "v2") {
file_version = mllm::ModelFileVersion::kV2;
} else {
fmt::print("❌ Unsupported model_version: {} (expected v1 or v2)\n", model_version.get());
mllm::shutdownContext();
return 1;
}

if (help.isSet()) {
Argparse::printHelp();
mllm::shutdownContext();
return 0;
}
Comment on lines +23 to +38
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, locate Argparse.hpp
find . -name "Argparse.hpp" -type f

Repository: UbiquitousLearning/mllm

Length of output: 92


🏁 Script executed:

# Once located, check the implementation of required arguments and get() method
rg -n "required" mllm/utils/Argparse.hpp -A 3 -B 1

Repository: UbiquitousLearning/mllm

Length of output: 605


🏁 Script executed:

# Also check the get() method implementation
rg -n "\.get\(\)" mllm/utils/Argparse.hpp -A 2 -B 1

Repository: UbiquitousLearning/mllm

Length of output: 484


🏁 Script executed:

# Look at the full context of main.cpp around the lines in question
head -50 examples/qwen3_moe/main.cpp | tail -40

Repository: UbiquitousLearning/mllm

Length of output: 1651


🏁 Script executed:

# Get the full parse() method implementation
rg -n "void parse" mllm/utils/Argparse.hpp -A 30

Repository: UbiquitousLearning/mllm

Length of output: 3232


🏁 Script executed:

# Also check if help flag has any special behavior
rg -n "help.*isSet\|--help" mllm/utils/Argparse.hpp -B 2 -A 2

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Get more of the parse method to see required arg validation
rg -n "static void parse" mllm/utils/Argparse.hpp -A 50 | head -80

Repository: UbiquitousLearning/mllm

Length of output: 1821


🏁 Script executed:

# Check if there's special handling for help before validation
rg -n "MLLM_ERROR_EXIT.*required" mllm/utils/Argparse.hpp -B 5 -A 2

Repository: UbiquitousLearning/mllm

Length of output: 344


🏁 Script executed:

# Get the complete parse method to see if there's early exit handling for help
rg -n "static void parse" mllm/utils/Argparse.hpp -A 70

Repository: UbiquitousLearning/mllm

Length of output: 2631


--help check happens after parse(), which exits if required arguments are missing—help will be unreachable.

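Argparse::parse() validates required arguments at the end (lines 200-205 in Argparse.hpp) and calls MLLM_ERROR_EXIT if any are missing. The --help flag is parsed successfully within parse(), but the subsequent required-argument validation still exits before the help check at line 34 executes. Because help.isSet() can only be true after parse() has run, the check cannot simply be moved ahead of parse(); to make help accessible without the required arguments, parse() must skip required-argument validation when the help flag is set (or argv must be pre-scanned for -h/--help before parse() is called). At minimum, run the help check immediately after parse(), before validating model_version, as shown below.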

Proposed fix
   Argparse::parse(argc, argv);

+  if (help.isSet()) {
+    Argparse::printHelp();
+    mllm::shutdownContext();
+    return 0;
+  }
+
   mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1;
   if (model_version.get() == "v1") {
     file_version = mllm::ModelFileVersion::kV1;
   } else if (model_version.get() == "v2") {
     file_version = mllm::ModelFileVersion::kV2;
   } else {
     fmt::print("❌ Unsupported model_version: {} (expected v1 or v2)\n", model_version.get());
     mllm::shutdownContext();
     return 1;
    }

-  if (help.isSet()) {
-    Argparse::printHelp();
-    mllm::shutdownContext();
-    return 0;
-  }
🤖 Prompt for AI Agents
In `@examples/qwen3_moe/main.cpp` around lines 23 - 38, Make the help output
reachable without the other arguments. help.isSet() is only meaningful after
Argparse::parse() has run, and parse() performs required-argument validation and
may exit first, so either pre-scan argv for -h/--help before invoking
Argparse::parse(), or have parse() skip required-argument validation when the
help flag is set. In main.cpp, place the help.isSet() check immediately after
Argparse::parse() and before the model_version validation, calling
Argparse::printHelp() then mllm::shutdownContext() and returning 0; keep
references to Argparse::printHelp(), help.isSet(), Argparse::parse(), and
mllm::shutdownContext() while relocating the check.


{
auto qwen3_moe_cfg = mllm::models::qwen3_moe::Qwen3MoeConfig(config_path.get());
auto qwen3_moe_tokenizer = mllm::models::qwen3_moe::Qwen3Tokenizer(tokenizer_path.get());
auto qwen3_moe = mllm::models::qwen3_moe::Qwen3MoeForCausalLM(qwen3_moe_cfg);

auto param = mllm::load(model_path.get(), file_version);
qwen3_moe.load(param);

// Print the session banner and read a single prompt line from stdin.
fmt::print("\n{:*^60}\n", " Qwen3 MoE Interactive CLI ");
fmt::print("Enter 'exit' or 'quit' to end the session\n\n");

std::string prompt_text;

fmt::print("💬 Prompt text (or 'exit/quit'): ");
// A failed read (EOF or closed stdin) is treated like an explicit exit instead
// of running the model on an empty prompt.
const bool read_ok = static_cast<bool>(std::getline(std::cin, prompt_text));

// NOTE(review): this early return bypasses perfSummary() and the perf/memory
// reports that follow, and unlike the earlier exits it does not call
// mllm::shutdownContext() — confirm that is intended.
if (!read_ok || prompt_text == "exit" || prompt_text == "quit") { return 0; }
Comment on lines +48 to +56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

UI says "interactive" but implementation is single-prompt only.

Lines 48-49 suggest an interactive session ("Enter 'exit' or 'quit' to end the session"), but there is no loop — only one prompt is accepted. Either wrap lines 53-68 in a while(true) loop or update the banner to reflect single-prompt behavior.

🤖 Prompt for AI Agents
In `@examples/qwen3_moe/main.cpp` around lines 48 - 56, The UI claims an
interactive CLI but main flow only reads one prompt into prompt_text then exits;
wrap the prompt/read/response/exit-check sequence (the block using prompt_text
and the prompt/ getline logic) inside a loop (e.g., while(true)) so the program
repeatedly prompts until prompt_text == "exit" || "quit", or alternatively
update the banner/strings to remove "interactive" and the "Enter 'exit' or
'quit' to end the session" line; modify the code paths referencing prompt_text
to support repeated iterations and clean termination.


// Tokenize the prompt, stream the generated reply to the terminal, and report
// any std::exception raised along the way instead of letting it escape.
try {
  fmt::print("🔄 Processing...\n");
  auto inputs = qwen3_moe_tokenizer.convertMessage({.prompt = prompt_text});

  fmt::print("\n🤖 Response: ");

  // Print each step's token as soon as it is produced; flushing after every
  // token makes the reply appear incrementally.
  for (auto& generated : qwen3_moe.chat(inputs)) {
    std::wcout << qwen3_moe_tokenizer.detokenize(generated.cur_token_id) << std::flush;
  }

  fmt::print("\n{}\n", std::string(60, '-'));
} catch (const std::exception& err) {
  fmt::print("\n❌ Error: {}\n{}\n", err.what(), std::string(60, '-'));
}

Comment on lines +48 to +69
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Honor the advertised exit/quit commands.
The prompt tells users these commands end the session, but the input is always processed.

🛠️ Proposed fix
     fmt::print("💬 Prompt text (or 'exit/quit'): ");
     std::getline(std::cin, prompt_text);
 
-    try {
-      fmt::print("🔄 Processing...\n");
-      auto inputs = qwen3_moe_tokenizer.convertMessage({.prompt = prompt_text});
-
-      fmt::print("\n🤖 Response: ");
-
-      // Use for loop
-      for (auto& step : qwen3_moe.chat(inputs)) { std::wcout << qwen3_moe_tokenizer.detokenize(step.cur_token_id) << std::flush; }
-
-      fmt::print("\n{}\n", std::string(60, '-'));
-    } catch (const std::exception& e) { fmt::print("\n❌ Error: {}\n{}\n", e.what(), std::string(60, '-')); }
+    if (prompt_text == "exit" || prompt_text == "quit") {
+      fmt::print("👋 Bye!\n");
+    } else {
+      try {
+        fmt::print("🔄 Processing...\n");
+        auto inputs = qwen3_moe_tokenizer.convertMessage({.prompt = prompt_text});
+
+        fmt::print("\n🤖 Response: ");
+
+        // Use for loop
+        for (auto& step : qwen3_moe.chat(inputs)) { std::wcout << qwen3_moe_tokenizer.detokenize(step.cur_token_id) << std::flush; }
+
+        fmt::print("\n{}\n", std::string(60, '-'));
+      } catch (const std::exception& e) { fmt::print("\n❌ Error: {}\n{}\n", e.what(), std::string(60, '-')); }
+    }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
fmt::print("\n{:*^60}\n", " Qwen3 MoE Interactive CLI ");
fmt::print("Enter 'exit' or 'quit' to end the session\n\n");
std::string prompt_text;
fmt::print("💬 Prompt text (or 'exit/quit'): ");
std::getline(std::cin, prompt_text);
try {
fmt::print("🔄 Processing...\n");
auto inputs = qwen3_moe_tokenizer.convertMessage({.prompt = prompt_text});
fmt::print("\n🤖 Response: ");
// Use for loop
for (auto& step : qwen3_moe.chat(inputs)) { std::wcout << qwen3_moe_tokenizer.detokenize(step.cur_token_id) << std::flush; }
fmt::print("\n{}\n", std::string(60, '-'));
} catch (const std::exception& e) { fmt::print("\n❌ Error: {}\n{}\n", e.what(), std::string(60, '-')); }
fmt::print("\n{:*^60}\n", " Qwen3 MoE Interactive CLI ");
fmt::print("Enter 'exit' or 'quit' to end the session\n\n");
std::string prompt_text;
fmt::print("💬 Prompt text (or 'exit/quit'): ");
std::getline(std::cin, prompt_text);
if (prompt_text == "exit" || prompt_text == "quit") {
fmt::print("👋 Bye!\n");
} else {
try {
fmt::print("🔄 Processing...\n");
auto inputs = qwen3_moe_tokenizer.convertMessage({.prompt = prompt_text});
fmt::print("\n🤖 Response: ");
// Use for loop
for (auto& step : qwen3_moe.chat(inputs)) { std::wcout << qwen3_moe_tokenizer.detokenize(step.cur_token_id) << std::flush; }
fmt::print("\n{}\n", std::string(60, '-'));
} catch (const std::exception& e) { fmt::print("\n❌ Error: {}\n{}\n", e.what(), std::string(60, '-')); }
}
🤖 Prompt for AI Agents
In `@examples/qwen3_moe/main.cpp` around lines 44 - 63, The program currently
always processes the input even when the user types the advertised "exit" or
"quit"; after reading prompt_text (variable prompt_text) trim/normalize it and
check for "exit" or "quit" (case-insensitive) and if matched, return from main
(or break out of the interactive loop, if one is added) instead of calling
qwen3_moe_tokenizer.convertMessage and qwen3_moe.chat; place this check
immediately after std::getline and before the try block so convertMessage and
qwen3_moe.chat are not invoked for exit/quit.

qwen3_moe.perfSummary();
}

// When built with MLLM_PERFETTO_ENABLE, stop the perf session started earlier
// and save its report as "qwen3_moe.perf".
#ifdef MLLM_PERFETTO_ENABLE
mllm::perf::stop();
mllm::perf::saveReport("qwen3_moe.perf");
#endif

// Emit the memory report last, after the model scope above has been closed.
mllm::print("\n");
mllm::memoryReport();
})
79 changes: 79 additions & 0 deletions examples/qwen3_moe/quant_cfg_30B_q4_k.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
{
"^model\\.layers\\.\\d+\\.self_attn\\.q_proj.(bias|weight)": {
"hints": {
"quant_method": "gguf",
"gguf_type": "Q4_K",
"shape": [
4096,
2048
],
"replace": true
}
},
"^model\\.layers\\.\\d+\\.self_attn\\.k_proj.(bias|weight)": {
"hints": {
"quant_method": "gguf",
"gguf_type": "Q4_K",
"shape": [
512,
2048
],
"replace": true
}
},
"^model\\.layers\\.\\d+\\.self_attn\\.v_proj.(bias|weight)": {
"hints": {
"quant_method": "gguf",
"gguf_type": "Q6_K",
"shape": [
512,
2048
],
"replace": true
}
},
"^model\\.layers\\.\\d+\\.self_attn\\.o_proj.(bias|weight)": {
"hints": {
"quant_method": "gguf",
"gguf_type": "Q4_K",
"shape": [
2048,
4096
],
"replace": true
}
},
"^model\\.layers\\.\\d+\\.mlp\\.experts\\.\\d+\\.up_proj.(bias|weight)": {
"hints": {
"quant_method": "gguf",
"gguf_type": "Q4_K",
"shape": [
768,
2048
],
"replace": true
}
},
"^model\\.layers\\.\\d+\\.mlp\\.experts\\.\\d+\\.down_proj.(bias|weight)": {
"hints": {
"quant_method": "gguf",
"gguf_type": "Q6_K",
"shape": [
2048,
768
],
"replace": true
}
},
Comment on lines +46 to +67
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# Check if gate_proj exists in quantization configs and model definitions
rg -n "gate_proj" --type=json

Repository: UbiquitousLearning/mllm

Length of output: 1218


🏁 Script executed:

# Also search in Python files for model definitions
rg -n "gate_proj" --type=py

Repository: UbiquitousLearning/mllm

Length of output: 1146


🏁 Script executed:

# List all quantization config files for comparison
fd -e json -p "quant_cfg"

Repository: UbiquitousLearning/mllm

Length of output: 477


🏁 Script executed:

# Search for qwen3_moe model definitions
fd -p "qwen3_moe" --type=py | head -20

Repository: UbiquitousLearning/mllm

Length of output: 236


🏁 Script executed:

# Check the qwen3_moe model file for MoE expert structure and gate_proj
fd -p "modeling_qwen3" --type=py | grep -i moe

Repository: UbiquitousLearning/mllm

Length of output: 236


🏁 Script executed:

# Search for Qwen3MoeMLP or similar MoE expert definitions
rg -n "Qwen3.*Moe|MoeMLP" --type=py -A 10

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Search for Qwen3MoeMLP or MoE expert definitions
rg -n "Qwen3.*Moe|MoeMLP" --type-add "py:*.py"

Repository: UbiquitousLearning/mllm

Length of output: 3702


🏁 Script executed:

# Search for qwen3_moe references in Python files
rg -n "qwen3_moe"

Repository: UbiquitousLearning/mllm

Length of output: 1964


🏁 Script executed:

# Search for expert projection definitions with gate_proj
rg -n "experts.*gate_proj|gate_proj.*experts"

Repository: UbiquitousLearning/mllm

Length of output: 50379


🏁 Script executed:

cat mllm/models/qwen3_moe/modeling_qwen3_moe_fa2.hpp | sed -n '71,112p'

Repository: UbiquitousLearning/mllm

Length of output: 1468


🏁 Script executed:

# Check if gate_proj is mentioned anywhere in the qwen3_moe model file
rg -n "gate_proj" mllm/models/qwen3_moe/

Repository: UbiquitousLearning/mllm

Length of output: 436


🏁 Script executed:

# Also check the current quant_cfg_30B_q4_k.json file directly for context
cat examples/qwen3_moe/quant_cfg_30B_q4_k.json

Repository: UbiquitousLearning/mllm

Length of output: 1612


Add gate_proj to MoE expert quantization config.

The model defines three projections per MoE expert (gate_proj, up_proj, down_proj), but only up_proj and down_proj are quantized. The gate_proj is actively used in the forward pass yet missing from the config. If intentional, add a comment; otherwise, add the following pattern (gate_proj has the same shape as up_proj, i.e. [moe_intermediate_size, hidden_size]):

"^model\\.layers\\.\\d+\\.mlp\\.experts\\.\\d+\\.gate_proj.(bias|weight)": {
  "hints": {
    "quant_method": "gguf",
    "gguf_type": "Q4_K",
    "shape": [768, 2048],
    "replace": true
  }
}
🤖 Prompt for AI Agents
In `@examples/qwen3_moe/quant_cfg_30B_q4_k.json` around lines 46 - 67, The MoE
expert config is missing the gate_proj pattern used in the forward pass; add a
new JSON entry matching
"^model\\.layers\\.\\d+\\.mlp\\.experts\\.\\d+\\.gate_proj.(bias|weight)" with
the same quant hints style as up_proj/down_proj (quant_method "gguf", gguf_type
"Q4_K", shape [768,2048] to match up_proj, replace true) so gate_proj weights
are quantized, or if omission was intentional, add an inline comment next to the
up_proj/down_proj blocks explaining why gate_proj is excluded; reference the
existing up_proj, down_proj and the model.layers.*.mlp.experts.*.gate_proj
symbol to locate where to add the change.

"^lm_head.weight": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Unescaped dot in lm_head.weight regex pattern.

All other patterns escape dots as \\. (e.g., self_attn\\.q_proj), but lm_head.weight uses an unescaped . which matches any character in regex. Should be ^lm_head\\.weight for consistency and correctness.

Proposed fix
-  "^lm_head.weight": {
+  "^lm_head\\.weight": {
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
"^lm_head.weight": {
"^lm_head\\.weight": {
🤖 Prompt for AI Agents
In `@examples/qwen3_moe/quant_cfg_30B_q4_k.json` at line 68, The regex entry for
"lm_head.weight" uses an unescaped dot which matches any character; update the
pattern to escape the dot (change the pattern for lm_head.weight to use
lm_head\\.weight) to match the literal field name consistently with other keys
(e.g., self_attn\\.q_proj) so the config only targets the intended parameter.

"hints": {
"quant_method": "gguf",
"gguf_type": "Q4_K",
"shape": [
151936,
2048
],
"replace": true
}
}
}
Loading