@@ -2295,24 +2295,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2295
2295
}
2296
2296
}
2297
2297
2298
- // avoid using a host buffer when using mmap
2299
- auto * buft_dev = ggml_backend_buft_get_device(buft);
2300
- if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
2301
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2302
- if (!cpu_dev) {
2303
- throw std::runtime_error("no CPU backend found");
2304
- }
2305
- buft = ggml_backend_dev_buffer_type(cpu_dev);
2298
+ // avoid using a host buffer when using mmap
2299
+ auto * buft_dev = ggml_backend_buft_get_device(buft);
2300
+ if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
2301
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2302
+ if (!cpu_dev) {
2303
+ throw std::runtime_error("no CPU backend found");
2304
+ }
2305
+
2306
+ // If enabled, prefer CPU "extra" (AMX) buffer types for weights on CPU; else use CPU default
2307
+ ggml_backend_buffer_type_t cpu_default_buft = ggml_backend_dev_buffer_type(cpu_dev);
2308
+ const bool prefer_cpu_extra = params.amx_enable_mmap;
2309
+
2310
+ if (!prefer_cpu_extra) {
2311
+ buft = cpu_default_buft;
2312
+ } else {
2313
+ ggml_backend_buffer_type_t chosen = nullptr;
2314
+
2315
+ // Iterate available buffer types, skipping device-host buffer types
2316
+ for (const auto & cur : *buft_list) {
2317
+ ggml_backend_dev_t cur_dev = cur.first;
2318
+ ggml_backend_buffer_type_t cur_buft = cur.second;
2319
+
2320
+ if (cur_dev && cur_buft == ggml_backend_dev_host_buffer_type(cur_dev)) {
2321
+ continue;
2306
2322
}
2307
2323
2308
- if (buft != buft_list->front().second) {
2309
- n_moved_tensors++;
2310
- if (!first_moved_tensor) {
2311
- first_moved_tensor = t_meta;
2312
- first_moved_from_buft = buft_list->front().second;
2313
- first_moved_to_buft = buft;
2324
+ // Prefer CPU "extra" (non-default) if supported for this tensor/op
2325
+ if (cur_dev == cpu_dev && cur_buft != cpu_default_buft) {
2326
+ if (weight_buft_supported(hparams, t_meta, op, cur_buft, cur_dev)) {
2327
+ chosen = cur_buft;
2328
+ break;
2314
2329
}
2315
2330
}
2331
+ }
2332
+
2333
+ buft = chosen ? chosen : cpu_default_buft;
2334
+ }
2335
+ }
2336
+
2337
+
2338
+ // count tensors whose chosen buffer type differs from the first entry of buft_list, and remember the first such tensor along with its from/to buffer types
2339
+ if (buft != buft_list->front().second) {
2340
+ n_moved_tensors++;
2341
+ if (!first_moved_tensor) {
2342
+ first_moved_tensor = t_meta;
2343
+ first_moved_from_buft = buft_list->front().second;
2344
+ first_moved_to_buft = buft;
2345
+ }
2346
+ }
2347
+
2316
2348
2317
2349
ggml_context * ctx = ctx_for_buft(buft);
2318
2350
@@ -19649,6 +19681,7 @@ llama_model_params llama_model_default_params() {
19649
19681
/*.use_mlock =*/ false,
19650
19682
/*.check_tensors =*/ false,
19651
19683
/*.use_extra_bufts =*/ true,
19684
+ /*.amx_enable_mmap =*/ false,
19652
19685
};
19653
19686
19654
19687
return result;
0 commit comments