Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 96 additions & 22 deletions tools/mllm-llm-benchmark/main.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include <string>
#include <fstream>
#include <vector>
#include <sstream>
#include <thread>
#include <chrono>
#include <algorithm> // For std::transform

#include <mllm/mllm.hpp>
#include <mllm/utils/Argparse.hpp>
Expand All @@ -16,6 +19,14 @@

#include "models/All.hpp"

// Fallback so the tool still builds when the build system does not inject
// the git hash (token pasted verbatim; stringified at the use site).
#ifndef MLLM_GIT_COMMIT_HASH
#define MLLM_GIT_COMMIT_HASH unknown
#endif

// Two-level stringify: STR(x) expands macro arguments before stringifying,
// so STR(MLLM_GIT_COMMIT_HASH) yields the hash text, not the macro name.
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)

// NOTE(review): the code below prints the hash via STRINGIFY(...), not STR(...).
// Provide a guarded fallback so the tool compiles even if the project headers
// do not define STRINGIFY — TODO confirm whether mllm headers already supply it.
#ifndef STRINGIFY
#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)
#endif

MLLM_MAIN({
auto& help = mllm::Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_name = mllm::Argparse::add<std::string>("-n|--model_name").help("Model name");
Expand All @@ -25,8 +36,19 @@ MLLM_MAIN({
auto& pp = mllm::Argparse::add<std::string>("-pp|--prompt_length").help("Prompt length");
auto& tg = mllm::Argparse::add<std::string>("-tg|--test_generation_length").help("Test Generation length");
auto& cache_length = mllm::Argparse::add<int32_t>("-cl|--cache_length").help("Cache length");

// New CLI Arguments
auto& runs = mllm::Argparse::add<int32_t>("-r|--runs").help("Number of benchmark runs").def(3);
auto& cooldown_s = mllm::Argparse::add<int32_t>("-cs|--cooldown_s").help("Cooldown time between runs in seconds").def(5);
auto& output_csv = mllm::Argparse::add<std::string>("-oc|--output_csv").help("Output results to a CSV file").def("");
auto& schema_version = mllm::Argparse::add<int32_t>("-sv|--schema_version").help("Schema version for output format").def(1);
auto& kv_dtype_bytes = mllm::Argparse::add<int32_t>("-kv|--kv_dtype_bytes").help("KV cache data type bytes (1: int8, 2: fp16, 4: fp32)").def(4);

mllm::Argparse::parse(argc, argv);

mllm::Context::instance().setCpuOpThreads(num_threads.get());
mllm::setMaximumNumThreads((uint32_t)num_threads.get());

// Print Build Version
mllm::print("MLLM Build Version :", STRINGIFY(MLLM_GIT_COMMIT_HASH));

Expand Down Expand Up @@ -58,6 +80,25 @@ MLLM_MAIN({
auto benchmark = createBenchmark(model_name.get());
MLLM_RT_ASSERT(benchmark != nullptr);


// Validate runs early to avoid huge reserve() when negative values cast to size_t.
int R = runs.get();
if (R <= 0) {
mllm::print("[ERROR] --runs must be > 0, got:", R);
return 1;
}

// Open file stream
std::ofstream csv_file;
if (!output_csv.get().empty()) {
csv_file.open(output_csv.get());
if (!csv_file.is_open()) {
mllm::print("[ERROR] Failed to open --output_csv:", output_csv.get());
return 1;
}
csv_file << "schema_version,git_commit,arch,model_name,pp,tg,ttft_ms,prefill_speed,decode_speed,prefill_ms,decode_ms_per_tok,kv_est_bytes_pp,kv_est_bytes_final\n";
}

// Print Model Info
mllm::print("Model Info");
benchmark->init(config_path.get(), model_path.get(), cache_length.get());
Expand Down Expand Up @@ -92,7 +133,7 @@ MLLM_MAIN({
for (size_t i = 0; i < pp_values.size(); ++i) { pp_tg_pairs.emplace_back(pp_values[i], tg_values[i]); }
}

// Actual run for 3 turns and gives avg results. Each turn will sleep for 5 seconds to let the SoC or GPU/NPU cool down.
// Actual run for configurable number of turns
mllm::print("\n========================================");
mllm::print("Starting Benchmark Tests");
mllm::print("========================================\n");
Expand All @@ -106,30 +147,33 @@ MLLM_MAIN({

// Storage for results
std::vector<BenchmarkTemplateResult> results;
results.reserve(3);
results.reserve(static_cast<size_t>(R));

for (int i = 0; i < 3; ++i) {
mllm::print(" Run", i + 1, "of 3...");
for (int i = 0; i < R; ++i) {
mllm::print(" Run", i + 1, "of", R, "...");

// Clear cache before each run
benchmark->clear();

// Run benchmark
auto result = benchmark->run(pp, tg);
results.push_back(result);

mllm::print(" TTFT :", result.ttft, "ms");
mllm::print(" Prefill Speed:", result.prefill_speed, "tokens/s");
mllm::print(" Decode Speed :", result.decode_speed, "tokens/s");

// Sleep for 5 seconds between runs to cool down
if (i < 2) {
mllm::print(" Cooling down for 5 seconds...");
std::this_thread::sleep_for(std::chrono::seconds(5));
float prefill_ms = (result.prefill_speed > 0.0f) ? (pp / result.prefill_speed) * 1000.0f : 0.0f;
float decode_ms_per_tok = (result.decode_speed > 0.0f) ? (1.0f / result.decode_speed) * 1000.0f : 0.0f;
mllm::print(" Prefill Latency :", prefill_ms, "ms");
mllm::print(" Decode Latency :", decode_ms_per_tok, "ms");

int cool = cooldown_s.get();
if (i + 1 < R && cool > 0) {
mllm::print(" Cooling down for", cool, "seconds...");
std::this_thread::sleep_for(std::chrono::seconds(cool));
}
}

// Calculate average results
float denom = (R > 0) ? static_cast<float>(R) : 1.0f;
float avg_ttft = 0.0f;
float avg_prefill_speed = 0.0f;
float avg_decode_speed = 0.0f;
Expand All @@ -140,20 +184,50 @@ MLLM_MAIN({
avg_decode_speed += result.decode_speed;
}

avg_ttft /= 3.0f;
avg_prefill_speed /= 3.0f;
avg_decode_speed /= 3.0f;

// Print average results
mllm::print("\n========== Average Results ==========");
mllm::print("Configuration: PP=", pp, " TG=", tg);
mllm::print("Average TTFT :", avg_ttft, "ms");
mllm::print("Average Prefill Speed:", avg_prefill_speed, "tokens/s");
mllm::print("Average Decode Speed :", avg_decode_speed, "tokens/s");
mllm::print("=====================================\n");
avg_ttft /= denom;
avg_prefill_speed /= denom;
avg_decode_speed /= denom;

float avg_prefill_ms = (avg_prefill_speed > 0.0f) ? (pp / avg_prefill_speed) * 1000.0f : 0.0f;
float avg_decode_ms_per_tok = (avg_decode_speed > 0.0f) ? (1.0f / avg_decode_speed) * 1000.0f : 0.0f;

// Rough KV cache estimate (bytes)
double kv_est_bytes_pp = 0.0;
double kv_est_bytes_final = 0.0;
if (auto info = benchmark->kvEstimateInfo(); info.has_value()) {
const int32_t bytes_per = kv_dtype_bytes.get(); // 1/2/4
// LLaMA-like KV: 2 * n_layers * n_kv_heads * head_dim * seq_len * bytes
kv_est_bytes_pp = 2.0 * info->num_layers * info->num_kv_heads * info->head_dim * (double)pp * bytes_per;
kv_est_bytes_final = 2.0 * info->num_layers * info->num_kv_heads * info->head_dim * (double)(pp + tg) * bytes_per;
}

// Prepare one line output (avg)
std::stringstream ss;
ss << schema_version.get() << ","
<< STRINGIFY(MLLM_GIT_COMMIT_HASH) << ","
<< mllm::cpu::CURRENT_ARCH_STRING << ","
<< model_name.get() << ","
<< pp << ","
<< tg << ","
<< avg_ttft << ","
<< avg_prefill_speed << ","
<< avg_decode_speed << ","
<< avg_prefill_ms << ","
<< avg_decode_ms_per_tok << ","
<< kv_est_bytes_pp << ","
<< kv_est_bytes_final;

if (csv_file.is_open()) {
csv_file << ss.str() << std::endl;
}
}

mllm::print("\n========================================");
mllm::print("Benchmark Tests Completed");
mllm::print("========================================");

//close file stream
if (csv_file.is_open()) {
csv_file.close();
}
})
25 changes: 20 additions & 5 deletions tools/mllm-llm-benchmark/models/All.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,35 @@

#include <memory>
#include <algorithm>
#include <string>
#include <cctype> // for std::tolower

#include "Qwen3_W4A32_KAI.hpp"
#include "BenchmarkTemplate.hpp"
#include "Qwen3_W4A32_KAI.hpp"
#include "Llama.hpp"

std::shared_ptr<BenchmarkTemplate> createBenchmark(const std::string& model_name) {
/// @brief Factory: map a model name to a concrete benchmark implementation.
/// @param model_name Case-insensitive model identifier (e.g. "Qwen3-W4A32-KAI").
/// @return A benchmark instance, or nullptr when the name matches no known model.
inline std::shared_ptr<BenchmarkTemplate> createBenchmark(const std::string& model_name) {
  auto tolower = [](const std::string& str) {
    std::string result = str;
    // NOTE: std::tolower expects unsigned char; the cast avoids UB for
    // negative char values on platforms where char is signed.
    std::transform(result.begin(), result.end(), result.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return result;
  };

  auto normalized_model_name = tolower(model_name);

  if (normalized_model_name.find("qwen3") != std::string::npos &&
      normalized_model_name.find("w4a32") != std::string::npos &&
      normalized_model_name.find("kai") != std::string::npos) {
    return std::make_shared<Qwen3_W4A32_KAI_Benchmark>();
  }

  // A single "llama" substring test also covers "tinyllama" and "tiny_llama",
  // so the previous extra checks were redundant.
  if (normalized_model_name.find("llama") != std::string::npos) {
    return std::make_shared<Llama_Benchmark>();
  }

  return nullptr;
}
12 changes: 12 additions & 0 deletions tools/mllm-llm-benchmark/models/BenchmarkTemplate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#pragma once

#include <string>
#include <optional>
#include <cstdint>

/**
* @brief Benchmark result structure
Expand All @@ -13,6 +15,12 @@ struct BenchmarkTemplateResult {
float decode_speed; ///< Decode phase speed in tokens/s
};

// Model geometry used to estimate KV cache size; the caller computes roughly
// 2 (K and V) * num_layers * num_kv_heads * head_dim * seq_len * bytes_per_element.
struct KVCacheEstimateInfo {
int32_t num_layers = 0;   // transformer layers contributing KV entries
int32_t num_kv_heads = 0; // KV heads — presumably may differ from attention heads under GQA; confirm per model
int32_t head_dim = 0; // hidden_size / num_attention_heads
};

/**
* @brief Base class for benchmark templates
*
Expand Down Expand Up @@ -58,4 +66,8 @@ class BenchmarkTemplate {
* @return Test results
*/
virtual BenchmarkTemplateResult run(int32_t pp, int32_t tg) = 0;

// Optional hook: models that can report their geometry override this so the
// benchmark can estimate KV cache memory. The default returns std::nullopt,
// signalling that the model does not support the estimate.
virtual std::optional<KVCacheEstimateInfo> kvEstimateInfo() const { return std::nullopt; }
};
Loading