common/arg.cpp (7 changes: 7 additions & 0 deletions)
@@ -2642,6 +2642,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.no_extra_bufts = true;
}
).set_env("LLAMA_ARG_NO_REPACK"));
add_opt(common_arg(
{"--no-host"},
"bypass host buffer allowing extra buffers to be used",
[](common_params & params) {
params.no_host = true;
}
).set_env("LLAMA_ARG_NO_HOST"));
add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE",
string_format(
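Note: since the option is registered with `.set_env("LLAMA_ARG_NO_HOST")`, it can also be enabled through the environment, following the same convention as the other boolean `common_arg` flags (for example, something like `LLAMA_ARG_NO_HOST=1 llama-server -m model.gguf`, where the model path is a placeholder).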
common/common.cpp (1 change: 1 addition & 0 deletions)
@@ -1133,6 +1133,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
mparams.no_host = params.no_host;

if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
common/common.h (1 change: 1 addition & 0 deletions)
@@ -392,6 +392,7 @@ struct common_params {
bool check_tensors = false; // validate tensor data
bool no_op_offload = false; // globally disable offload host tensor operations to device
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
bool no_host = false; // bypass host buffer allowing extra buffers to be used

bool single_turn = false; // single turn chat conversation

include/llama.h (1 change: 1 addition & 0 deletions)
@@ -296,6 +296,7 @@ extern "C" {
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
bool no_host; // bypass host buffer allowing extra buffers to be used
};

// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
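For programs that use the C API directly, `no_host` is just another field of `llama_model_params`. A minimal sketch of enabling it from client code, assuming the standard `llama.h` entry points and a placeholder model path:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.no_host = true; // skip pinned host buffers so extra buffer types can be used

    // "model.gguf" is a placeholder path for illustration
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```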
src/llama-model.cpp (17 changes: 10 additions & 7 deletions)
@@ -310,7 +310,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
}

// CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
buft_list_t buft_list;

// add ACCEL buffer types
@@ -331,11 +331,13 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
// generally, this will be done using the first device in the list
// a better approach would be to handle this on a weight-by-weight basis using the offload_op
// function of the device to determine if it would benefit from being stored in a host buffer
-    for (auto * dev : devices) {
-        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
-        if (buft) {
-            buft_list.emplace_back(dev, buft);
-            break;
+    if (!no_host) {
+        for (auto * dev : devices) {
+            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+            if (buft) {
+                buft_list.emplace_back(dev, buft);
+                break;
+            }
         }
     }

@@ -2062,7 +2064,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

// build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
for (auto * dev : devices) {
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
// add CPU buffer types as a fallback
@@ -19651,6 +19653,7 @@ llama_model_params llama_model_default_params() {
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
/*.no_host =*/ false,
};

return result;
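The chain the loader builds for CPU-resident weights is ACCEL -> GPU host -> CPU extra -> CPU, and the new flag simply skips the GPU-host entry. An illustrative probe of that entry using the public ggml-backend API (not part of this change, shown only to clarify what `ggml_backend_dev_host_buffer_type` returns):

```cpp
#include "ggml-backend.h"
#include <cstdio>

// List which registered devices expose a pinned host buffer type,
// i.e. the buffer type that --no-host removes from the CPU chain.
int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        ggml_backend_buffer_type_t host_buft = ggml_backend_dev_host_buffer_type(dev);
        printf("%s: host buffer type: %s\n",
               ggml_backend_dev_name(dev),
               host_buft ? ggml_backend_buft_name(host_buft) : "(none)");
    }
    return 0;
}
```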
tools/llama-bench/llama-bench.cpp (38 changes: 35 additions & 3 deletions)
@@ -357,6 +357,7 @@ struct cmd_params {
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
std::vector<bool> no_op_offload;
std::vector<bool> no_host;
ggml_numa_strategy numa;
int reps;
ggml_sched_priority prio;
@@ -394,6 +395,7 @@ static const cmd_params cmd_params_defaults = {
/* use_mmap */ { true },
/* embeddings */ { false },
/* no_op_offload */ { false },
/* no_host */ { false },
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -474,6 +476,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
printf(" (default: disabled)\n");
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" --no-host <0|1> (default: %s)\n",
join(cmd_params_defaults.no_host, ",").c_str());
printf("\n");
printf(
"Multiple values can be given for each parameter by separating them with ','\n"
@@ -803,6 +807,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<bool>(argv[i], split_delim);
params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
} else if (arg == "--no-host") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.no_host.insert(params.no_host.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
@@ -1024,6 +1035,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.no_op_offload.empty()) {
params.no_op_offload = cmd_params_defaults.no_op_offload;
}
if (params.no_host.empty()) {
params.no_host = cmd_params_defaults.no_host;
}
if (params.n_threads.empty()) {
params.n_threads = cmd_params_defaults.n_threads;
}
@@ -1065,6 +1079,7 @@ struct cmd_params_instance {
bool use_mmap;
bool embeddings;
bool no_op_offload;
bool no_host;

llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -1077,6 +1092,7 @@
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
mparams.no_host = no_host;

if (n_cpu_moe <= 0) {
if (tensor_buft_overrides.empty()) {
@@ -1159,6 +1175,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & mmp : params.use_mmap)
for (const auto & embd : params.embeddings)
for (const auto & nopo : params.no_op_offload)
for (const auto & noh : params.no_host)
for (const auto & nb : params.n_batch)
for (const auto & nub : params.n_ubatch)
for (const auto & tk : params.type_k)
@@ -1199,6 +1216,7 @@
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .no_host = */ noh,
};
instances.push_back(instance);
}
@@ -1232,6 +1250,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .no_host = */ noh,
};
instances.push_back(instance);
}
@@ -1265,6 +1284,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .no_host = */ noh,
};
instances.push_back(instance);
}
@@ -1303,6 +1323,7 @@ struct test {
bool use_mmap;
bool embeddings;
bool no_op_offload;
bool no_host;
int n_prompt;
int n_gen;
int n_depth;
@@ -1339,6 +1360,7 @@ struct test {
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
no_op_offload = inst.no_op_offload;
no_host = inst.no_host;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
n_depth = inst.n_depth;
@@ -1386,8 +1408,8 @@ struct test {
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
"tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload",
"n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
"stddev_ns", "avg_ts", "stddev_ts"
"no_host", "n_prompt", "n_gen", "n_depth", "test_time",
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
};
return fields;
}
@@ -1402,7 +1424,7 @@ struct test {
return INT;
}
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
field == "use_mmap" || field == "embeddings") {
field == "use_mmap" || field == "embeddings" || field == "no_host") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -1477,6 +1499,7 @@ struct test {
std::to_string(use_mmap),
std::to_string(embeddings),
std::to_string(no_op_offload),
std::to_string(no_host),
std::to_string(n_prompt),
std::to_string(n_gen),
std::to_string(n_depth),
@@ -1665,6 +1688,9 @@ struct markdown_printer : public printer {
if (field == "no_op_offload") {
return 4;
}
if (field == "no_host") {
return 4;
}

int width = std::max((int) field.length(), 10);

@@ -1699,6 +1725,9 @@
if (field == "no_op_offload") {
return "nopo";
}
if (field == "no_host") {
return "noh";
}
if (field == "devices") {
return "dev";
}
@@ -1779,6 +1808,9 @@
if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
fields.emplace_back("no_op_offload");
}
if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) {
fields.emplace_back("no_host");
}
fields.emplace_back("test");
fields.emplace_back("t/s");

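With the plumbing above, `--no-host` can be swept like any other llama-bench parameter: for example, `llama-bench -m model.gguf --no-host 0,1` (placeholder model path) runs each test with the host-buffer path both enabled and bypassed, and the markdown output gains a `noh` column whenever more than one value or a non-default value is requested.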