UbiquitousLearning · oreomaker · Feb 17, 2026 · Jan 7, 2026 · Jan 30, 2026 · Feb 15, 2026
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -7,6 +7,7 @@ add_subdirectory(minicpm_o)
 add_subdirectory(minicpm4)
 add_subdirectory(qwen3)
 add_subdirectory(qwen3_service)
+add_subdirectory(qwen3_moe)
 add_subdirectory(deepseek_ocr)
 
 if(MLLM_BUILD_QNN_BACKEND)

diff --git a/examples/qwen3_moe/CMakeLists.txt b/examples/qwen3_moe/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_executable(mllm-qwen3-moe-runner main.cpp)
+target_link_libraries(mllm-qwen3-moe-runner PRIVATE MllmRT MllmCPUBackend)
+target_include_directories(mllm-qwen3-moe-runner PRIVATE ${MLLM_INCLUDE_DIR})
diff --git a/examples/qwen3_moe/config_30B_A3B_gguf.json b/examples/qwen3_moe/config_30B_A3B_gguf.json
@@ -0,0 +1,37 @@
+{
+  "architectures": [
+    "Qwen3MoeForCausalLM"
+  ],
+  "attention_bias": false,
+  "bos_token_id": 151643,
+  "decoder_sparse_step": 1,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "max_position_embeddings": 262144,
+  "max_window_layers": 48,
+  "mlp_only_layers": [],
+  "model_type": "qwen3_moe",
+  "moe_intermediate_size": 768,
+  "norm_topk_prob": true,
+  "num_attention_heads": 32,
+  "num_experts": 128,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 4,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": 1.0,
+  "rope_theta": 10000000,
+  "router_aux_loss_coef": 0.001,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.51.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936,
+  "max_cache_length": 16384,
+  "linear_impl_type": "Default"
+}
diff --git a/examples/qwen3_moe/main.cpp b/examples/qwen3_moe/main.cpp
@@ -0,0 +1,90 @@
+#include <iostream>
+#include <string>
+#include <fmt/core.h>
+#include <mllm/mllm.hpp>
+#include <mllm/models/qwen3_moe/modeling_qwen3_moe_fa2.hpp>
+#include <mllm/models/qwen3_moe/tokenization_qwen3_moe.hpp>
+#include <mllm/utils/AnyValue.hpp>
+
+using mllm::Argparse;
+
+// Example runner for Qwen3 MoE: parses the command line, builds the config, tokenizer and model,
+// loads the parameters, answers one prompt read from stdin and then prints the perf/memory reports.
+MLLM_MAIN({
+  auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
+  auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model path").required(true);
+  auto& model_version = Argparse::add<std::string>("-mv|--model_version").help("Model version").required(true);
+  auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer_path").help("Tokenizer directory").required(true);
+  auto& config_path = Argparse::add<std::string>("-c|--config_path").help("Config path").required(true);
+
+  Argparse::parse(argc, argv);
+
+  // Handle --help before any validation so it is never pre-empted by the model_version error path.
+  if (help.isSet()) {
+    Argparse::printHelp();
+    mllm::shutdownContext();
+    return 0;
+  }
+
+  // Translate the -mv string into the mllm::ModelFileVersion that is passed to mllm::load() below.
+  mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1;
+  if (model_version.get() == "v1") {
+    file_version = mllm::ModelFileVersion::kV1;
+  } else if (model_version.get() == "v2") {
+    file_version = mllm::ModelFileVersion::kV2;
+  } else {
+    fmt::print("❌ Unsupported model_version: {} (expected v1 or v2)\n", model_version.get());
+    mllm::shutdownContext();
+    return 1;
+  }
+
+  // Tracing starts only after every early return above, so each start() is paired with the stop() below.
+#ifdef MLLM_PERFETTO_ENABLE
+  mllm::perf::start();
+#endif
+
+  // Inner scope: config, tokenizer, model and params go out of scope at its closing brace, before the final reports.
+  {
+    auto qwen3_moe_cfg = mllm::models::qwen3_moe::Qwen3MoeConfig(config_path.get());
+    auto qwen3_moe_tokenizer = mllm::models::qwen3_moe::Qwen3Tokenizer(tokenizer_path.get());
+    auto qwen3_moe = mllm::models::qwen3_moe::Qwen3MoeForCausalLM(qwen3_moe_cfg);
+
+    auto param = mllm::load(model_path.get(), file_version);
+    qwen3_moe.load(param);
+
+    fmt::print("\n{:*^60}\n", " Qwen3 MoE Interactive CLI ");
+    fmt::print("Enter 'exit' or 'quit' to end the session\n\n");
+
+    std::string prompt_text;
+
+    fmt::print("💬 Prompt text (or 'exit/quit'): ");
+    // A failed read (e.g. end of input on stdin) is treated the same as an explicit exit request.
+    const bool has_prompt = static_cast<bool>(std::getline(std::cin, prompt_text));
+
+    // Branch instead of returning here so the perf summary, trace report and memory report still run.
+    if (!has_prompt || prompt_text == "exit" || prompt_text == "quit") {
+      fmt::print("👋 Bye!\n");
+    } else {
+      try {
+        fmt::print("🔄 Processing...\n");
+        auto inputs = qwen3_moe_tokenizer.convertMessage({.prompt = prompt_text});
+
+        fmt::print("\n🤖 Response: ");
+
+        // Print each step's detokenized token and flush right away so the reply appears incrementally.
+        for (auto& step : qwen3_moe.chat(inputs)) { std::wcout << qwen3_moe_tokenizer.detokenize(step.cur_token_id) << std::flush; }
+
+        fmt::print("\n{}\n", std::string(60, '-'));
+      } catch (const std::exception& e) { fmt::print("\n❌ Error: {}\n{}\n", e.what(), std::string(60, '-')); }
+    }
+    qwen3_moe.perfSummary();
+  }
+
+#ifdef MLLM_PERFETTO_ENABLE
+  mllm::perf::stop();
+  mllm::perf::saveReport("qwen3_moe.perf");
+#endif
+
+  mllm::print("\n");
+  mllm::memoryReport();
+})
diff --git a/examples/qwen3_moe/quant_cfg_30B_q4_k.json b/examples/qwen3_moe/quant_cfg_30B_q4_k.json
@@ -0,0 +1,79 @@
+{
+  "^model\\.layers\\.\\d+\\.self_attn\\.q_proj\\.(bias|weight)": {
+    "hints": {
+      "quant_method": "gguf",
+      "gguf_type": "Q4_K",
+      "shape": [
+        4096,
+        2048
+      ],
+      "replace": true
+    }
+  },
+  "^model\\.layers\\.\\d+\\.self_attn\\.k_proj\\.(bias|weight)": {
+    "hints": {
+      "quant_method": "gguf",
+      "gguf_type": "Q4_K",
+      "shape": [
+        512,
+        2048
+      ],
+      "replace": true
+    }
+  },
+  "^model\\.layers\\.\\d+\\.self_attn\\.v_proj\\.(bias|weight)": {
+    "hints": {
+      "quant_method": "gguf",
+      "gguf_type": "Q6_K",
+      "shape": [
+        512,
+        2048
+      ],
+      "replace": true
+    }
+  },
+  "^model\\.layers\\.\\d+\\.self_attn\\.o_proj\\.(bias|weight)": {
+    "hints": {
+      "quant_method": "gguf",
+      "gguf_type": "Q4_K",
+      "shape": [
+        2048,
+        4096
+      ],
+      "replace": true
+    }
+  },
+  "^model\\.layers\\.\\d+\\.mlp\\.experts\\.\\d+\\.up_proj\\.(bias|weight)": {
+    "hints": {
+      "quant_method": "gguf",
+      "gguf_type": "Q4_K",
+      "shape": [
+        768,
+        2048
+      ],
+      "replace": true
+    }
+  },
+  "^model\\.layers\\.\\d+\\.mlp\\.experts\\.\\d+\\.down_proj\\.(bias|weight)": {
+    "hints": {
+      "quant_method": "gguf",
+      "gguf_type": "Q6_K",
+      "shape": [
+        2048,
+        768
+      ],
+      "replace": true
+    }
+  },
+  "^lm_head\\.weight": {
+    "hints": {
+      "quant_method": "gguf",
+      "gguf_type": "Q4_K",
+      "shape": [
+        151936,
+        2048
+      ],
+      "replace": true
+    }
+  }
+}