From b9675d9df491eb8e7b5c2d656b4909b3cf487272 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 9 Aug 2025 07:18:22 +0000 Subject: [PATCH 1/9] Checkpoint from VS Code for coding agent session --- common/arg.cpp | 54 +++++++++++++++++++ common/common.h | 1 + .../speculative-simple/speculative-simple.cpp | 8 +++ examples/speculative/speculative.cpp | 8 +++ tools/server/server.cpp | 7 +++ 5 files changed, 78 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 0f01bb31454a4..2476c20bc713f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2383,6 +2383,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } )); + add_opt(common_arg( + {"--override-tensor-draft"}, "=,...", + "override tensor buffer type for draft model", [](common_params & params, const std::string & value) { + /* static */ std::map buft_list; + if (buft_list.empty()) { + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + for (const auto & override : string_split(value, ',')) { + std::string::size_type pos = override.find('='); + if (pos == std::string::npos) { + throw std::invalid_argument("invalid value"); + } + std::string tensor_name = override.substr(0, pos); + std::string buffer_type = override.substr(pos + 1); + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + throw std::invalid_argument("unknown buffer type"); + } + static std::list buft_overrides; + buft_overrides.push_back(tensor_name); + params.speculative.tensor_buft_overrides.push_back({buft_overridest.back().c_str(), buft_list.at(buffer_type)}); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--cpu-moe", "-cmoe"}, "keep all Mixture of Experts (MoE) weights in the CPU", @@ -2405,6 +2438,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_env("LLAMA_ARG_N_CPU_MOE")); + add_opt(common_arg( + {"--cpu-moe-draft", "-cmoed"}, + "keep all Mixture of Experts (MoE) weights in the CPU for the draft model", + [](common_params & params) { + params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); + add_opt(common_arg( + {"--n-cpu-moe-draft", "-ncmoed"}, "N", + "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model", + [](common_params & params, int value) { + if (value < 0) { + throw std::invalid_argument("invalid value"); + } + for (int i = 0; i < value; ++i) { + static std::list buft_overrides_draft; + buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()}); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT")); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", diff --git a/common/common.h b/common/common.h index 5eab199af559e..c09509b669e54 100644 --- a/common/common.h +++ b/common/common.h @@ -202,6 +202,7 @@ struct common_params_speculative { float p_split = 0.1f; // speculative decoding split probability float p_min = 0.75f; // minimum speculative decoding probability (greedy) std::vector> replacements; // main to speculative model replacements + std::vector tensor_buft_overrides; ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 722cd7f40f088..74734cf3beaba 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -59,6 +59,14 @@ int main(int argc, char ** argv) { } params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; + + // Apply tensor overrides for draft model + if (!params.speculative.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; + } else { + params.tensor_buft_overrides.clear(); + } + common_init_result llama_init_dft = common_init_from_params(params); //model_dft = llama_init_dft.model.get(); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 0adffdb006bcf..bac74d61a40f3 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -85,6 +85,14 @@ int main(int argc, char ** argv) { } params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; + + // Apply tensor overrides for draft model + if (!params.speculative.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; + } else { + params.tensor_buft_overrides.clear(); + } + common_init_result llama_init_dft = common_init_from_params(params); model_dft = llama_init_dft.model.get(); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index a255d481a4d1c..7e3b399f9abb7 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2011,6 +2011,13 @@ struct server_context { params_dft.cache_type_k = params_base.speculative.cache_type_k; params_dft.cache_type_v = params_base.speculative.cache_type_v; + // Apply tensor overrides for draft model + if (!params_base.speculative.tensor_buft_overrides.empty()) { + params_dft.tensor_buft_overrides = params_base.speculative.tensor_buft_overrides; + } else { + params_dft.tensor_buft_overrides.clear(); // ensure no main overrides leak in + } + llama_init_dft = common_init_from_params(params_dft); model_dft = llama_init_dft.model.get(); From f98ee9eee8f9d1c64dd8be4e963933e0f1b3e421 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:18:27 +0000 Subject: [PATCH 2/9] Initial plan From 06f7547c03c5275efa12258359021ed6b472bb7f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:30:04 +0000 Subject: [PATCH 3/9] Fix typo in --override-tensor-draft flag implementation Co-authored-by: CISC <1629204+CISC@users.noreply.github.com> --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 2476c20bc713f..0f462e70f1530 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2412,7 +2412,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } static std::list buft_overrides; buft_overrides.push_back(tensor_name); - params.speculative.tensor_buft_overrides.push_back({buft_overridest.back().c_str(), buft_list.at(buffer_type)}); + params.speculative.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)}); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); From e41f1ded83ef16e4a6d1bcef94c3ce6eb7103488 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:32:58 +0000 Subject: [PATCH 4/9] Add null termination for speculative tensor buffer overrides Co-authored-by: CISC <1629204+CISC@users.noreply.github.com> --- common/arg.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 0f462e70f1530..383f40d2a1a01 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -993,6 +993,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.tensor_buft_overrides.push_back({nullptr, nullptr}); } + if (!params.speculative.tensor_buft_overrides.empty()) { + params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr}); + } + if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) { throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s%s\n", From 0d38374d971d0bf07abf187de95d0442e5d58d52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 9 Aug 2025 09:51:44 +0200 Subject: [PATCH 5/9] Apply suggestions from code review --- common/arg.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 383f40d2a1a01..e2478d45b4ba8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2400,6 +2400,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } } + for (const auto & override : string_split(value, ',')) { std::string::size_type pos = override.find('='); if (pos == std::string::npos) { @@ -2407,6 +2408,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } std::string tensor_name = override.substr(0, pos); std::string buffer_type = override.substr(pos + 1); + if (buft_list.find(buffer_type) == buft_list.end()) { printf("Available buffer types:\n"); for (const auto & it : buft_list) { @@ -2414,6 +2416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } throw std::invalid_argument("unknown buffer type"); } + // keep strings alive and avoid leaking memory by storing them in a static vector static std::list buft_overrides; buft_overrides.push_back(tensor_name); params.speculative.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)}); From c87f4b0fe4f11d77d7e2b7849cb14c7800b1d9f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 10 Aug 2025 14:13:07 +0200 Subject: [PATCH 6/9] Apply suggestions from code review --- examples/speculative-simple/speculative-simple.cpp | 8 +------- examples/speculative/speculative.cpp | 8 +------- tools/server/server.cpp | 7 +------ 3 files changed, 3 insertions(+), 20 deletions(-) diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 74734cf3beaba..a8e53f28eb597 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -59,13 +59,7 @@ int main(int argc, char ** argv) { } params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; - - // Apply tensor overrides for draft model - if (!params.speculative.tensor_buft_overrides.empty()) { - params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; - } else { - params.tensor_buft_overrides.clear(); - } + params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; common_init_result llama_init_dft = common_init_from_params(params); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index bac74d61a40f3..8449406a6d27a 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -85,13 +85,7 @@ int main(int argc, char ** argv) { } params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; - - // Apply tensor overrides for draft model - if (!params.speculative.tensor_buft_overrides.empty()) { - params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; - } else { - params.tensor_buft_overrides.clear(); - } + params.tensor_buft_overrides = params.speculative.tensor_buft_overrides; common_init_result llama_init_dft = common_init_from_params(params); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 7e3b399f9abb7..34877c1d8304a 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2011,12 +2011,7 @@ struct server_context { params_dft.cache_type_k = params_base.speculative.cache_type_k; params_dft.cache_type_v = params_base.speculative.cache_type_v; - // Apply tensor overrides for draft model - if (!params_base.speculative.tensor_buft_overrides.empty()) { - params_dft.tensor_buft_overrides = params_base.speculative.tensor_buft_overrides; - } else { - params_dft.tensor_buft_overrides.clear(); // ensure no main overrides leak in - } + params_dft.tensor_buft_overrides = params_base.speculative.tensor_buft_overrides; llama_init_dft = common_init_from_params(params_dft); From 072ab9c700781e730e97d9f17a89fa7da07ee81b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 11 Aug 2025 11:23:48 +0000 Subject: [PATCH 7/9] Extract tensor override parsing logic to common function (addresses @slaren's feedback) Co-authored-by: ggerganov <1991296+ggerganov@users.noreply.github.com> --- common/arg.cpp | 100 ++++++++++++++++++------------------------------- 1 file changed, 37 insertions(+), 63 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index e2478d45b4ba8..70215822461e8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -749,6 +749,41 @@ std::pair> common_remote_get_content(const std::string & // utils // +// Helper function to parse tensor buffer override strings +static void parse_tensor_buffer_overrides(const std::string & value, std::vector & overrides) { + static std::map buft_list; + if (buft_list.empty()) { + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + + for (const auto & override : string_split(value, ',')) { + std::string::size_type pos = override.find('='); + if (pos == std::string::npos) { + throw std::invalid_argument("invalid value"); + } + std::string tensor_name = override.substr(0, pos); + std::string buffer_type = override.substr(pos + 1); + + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + throw std::invalid_argument("unknown buffer type"); + } + // keep strings alive and avoid leaking memory by storing them in a static vector + static std::list buft_overrides; + buft_overrides.push_back(tensor_name); + overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)}); + } +} + struct handle_model_result { bool found_mmproj = false; common_params_model mmproj; @@ -2353,74 +2388,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--override-tensor", "-ot"}, "=,...", "override tensor buffer type", [](common_params & params, const std::string & value) { - /* static */ std::map buft_list; - if (buft_list.empty()) { - // enumerate all the devices and add their buffer types to the list - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - auto * dev = ggml_backend_dev_get(i); - auto * buft = ggml_backend_dev_buffer_type(dev); - if (buft) { - buft_list[ggml_backend_buft_name(buft)] = buft; - } - } - } - - for (const auto & override : string_split(value, ',')) { - std::string::size_type pos = override.find('='); - if (pos == std::string::npos) { - throw std::invalid_argument("invalid value"); - } - std::string tensor_name = override.substr(0, pos); - std::string buffer_type = override.substr(pos + 1); - - if (buft_list.find(buffer_type) == buft_list.end()) { - printf("Available buffer types:\n"); - for (const auto & it : buft_list) { - printf(" %s\n", ggml_backend_buft_name(it.second)); - } - throw std::invalid_argument("unknown buffer type"); - } - // keep strings alive and avoid leaking memory by storing them in a static vector - static std::list buft_overrides; - buft_overrides.push_back(tensor_name); - params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)}); - } + parse_tensor_buffer_overrides(value, params.tensor_buft_overrides); } )); add_opt(common_arg( {"--override-tensor-draft"}, "=,...", "override tensor buffer type for draft model", [](common_params & params, const std::string & value) { - /* static */ std::map buft_list; - if (buft_list.empty()) { - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - auto * dev = ggml_backend_dev_get(i); - auto * buft = ggml_backend_dev_buffer_type(dev); - if (buft) { - buft_list[ggml_backend_buft_name(buft)] = buft; - } - } - } - - for (const auto & override : string_split(value, ',')) { - std::string::size_type pos = override.find('='); - if (pos == std::string::npos) { - throw std::invalid_argument("invalid value"); - } - std::string tensor_name = override.substr(0, pos); - std::string buffer_type = override.substr(pos + 1); - - if (buft_list.find(buffer_type) == buft_list.end()) { - printf("Available buffer types:\n"); - for (const auto & it : buft_list) { - printf(" %s\n", ggml_backend_buft_name(it.second)); - } - throw std::invalid_argument("unknown buffer type"); - } - // keep strings alive and avoid leaking memory by storing them in a static vector - static std::list buft_overrides; - buft_overrides.push_back(tensor_name); - params.speculative.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)}); - } + parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( From 7906f01ab7870ca511e82355acc6c71c6246f5d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 11 Aug 2025 19:32:13 +0200 Subject: [PATCH 8/9] Apply suggestions from code review Co-authored-by: Diego Devesa --- common/arg.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 70215822461e8..a215405493a27 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -751,14 +751,12 @@ std::pair> common_remote_get_content(const std::string & // Helper function to parse tensor buffer override strings static void parse_tensor_buffer_overrides(const std::string & value, std::vector & overrides) { - static std::map buft_list; - if (buft_list.empty()) { - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - auto * dev = ggml_backend_dev_get(i); - auto * buft = ggml_backend_dev_buffer_type(dev); - if (buft) { - buft_list[ggml_backend_buft_name(buft)] = buft; - } + std::map buft_list; + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; } } From d276b459a17ffba4558ea788fb5b2d35ba11135d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 13 Aug 2025 08:57:05 +0200 Subject: [PATCH 9/9] Apply suggestions --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index a215405493a27..066abff1d713c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2390,7 +2390,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( - {"--override-tensor-draft"}, "=,...", + {"--override-tensor-draft", "-otd"}, "=,...", "override tensor buffer type for draft model", [](common_params & params, const std::string & value) { parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides); }