diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50a80629a1c..cb87fdea89e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,7 @@ set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
set(GGML_CUDA_GRAPHS ON)
set(GGML_CUDA_FA ON)
set(GGML_CUDA_COMPRESSION_MODE default)
+set(GGML_OPENVINO ON)
if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
@@ -71,6 +72,11 @@ install(TARGETS ggml-base ${CPU_VARIANTS}
FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
)
+install(TARGETS ggml-openvino
+ RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT OPENVINO
+ LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT OPENVINO
+)
+
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES)
diff --git a/Modelfile b/Modelfile
new file mode 100644
index 00000000000..eb971c2d47f
--- /dev/null
+++ b/Modelfile
@@ -0,0 +1 @@
+FROM ./Llama-3.2-1B-Instruct.fp16.gguf
diff --git a/docs/openvino.md b/docs/openvino.md
new file mode 100644
index 00000000000..329d40d816e
--- /dev/null
+++ b/docs/openvino.md
@@ -0,0 +1,100 @@
+# OpenVINO Backend in Ollama
+
+OpenVINO is a high-performance AI inference toolkit that optimizes inference on Intel CPUs, Intel integrated and discrete GPUs, and NPUs. This branch adds an OpenVINO backend to Ollama: the backend converts the GGML compute graph to OpenVINO IR and accelerates inference on Intel AI PCs.
+
+# Instructions to Build and Run the OpenVINO Backend
+
+## Prerequisites
+
+- A Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
+- **For Intel GPU or NPU usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html). A quick device check is shown after the package list below.
+- Git, CMake, and Ninja are required for building (the commands below assume a Debian-based distribution):
+```bash
+sudo apt-get update
+sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
+```
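+
+To confirm that an Intel GPU is visible on Linux (assuming the standard DRM render nodes are exposed), you can run:
+```bash
+ls -l /dev/dri
+```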
+
+## Install OpenVINO Runtime
+
+- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)
+
+
+The following commands install OpenVINO 2025.2 on Linux:
+
+```bash
+export OPENVINO_VERSION_MAJOR=2025.2
+export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
+sudo apt-get update
+sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
+sudo mkdir -p /opt/intel
+wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
+tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
+sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
+cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
+echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
+sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
+source /opt/intel/openvino/setupvars.sh
+```
+
+- Verify that OpenVINO is initialized properly:
+```bash
+echo $OpenVINO_DIR
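+# expect a path such as /opt/intel/openvino/runtime/cmake (exact value may differ)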
+```
+
+## Build Ollama with the OpenVINO Backend
+
+### Clone Ollama
+
+Clone the OpenVINO-enabled Ollama fork:
+
+```bash
+git clone https://github.com/ynimmaga/ollama.git
+cd ollama
+git switch poc_openvino_backend
+```
+
+### Build the GGML OpenVINO Backend and Add It to the Library Path
+
+```bash
+mkdir build && cd build
+cmake .. -DGGML_OPENVINO=ON -DBUILD_SHARED_LIBS=ON
+make -j8
+export LD_LIBRARY_PATH=$PWD/lib/ollama:$LD_LIBRARY_PATH
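+# INTEL_OPENVINO_DIR is set by sourcing setupvars.sh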
+export CGO_LDFLAGS="-L$INTEL_OPENVINO_DIR/runtime/lib/intel64"
+```
+
+### Build Ollama
+
+```bash
+cd $ollama_root   # root of the cloned ollama repository
+go clean -cache
+go mod tidy
+go build .
+```
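+
+Optionally, confirm the binary was built (the version flag is part of the standard Ollama CLI):
+```bash
+./ollama --version
+```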
+
+## Download a Model for Testing
+
+```bash
+# Download model file: Llama-3.2-1B-Instruct.fp16.gguf
+wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \
+ -O Llama-3.2-1B-Instruct.fp16.gguf
+```
+
+## Create a Modelfile with the Following Content
+```bash
+FROM ./Llama-3.2-1B-Instruct.fp16.gguf
+```
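+
+This branch already includes a matching Modelfile at the repository root.
+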
+## Start the Ollama Server and Run Inference
+
+```bash
+cd $ollama_root
+./ollama serve
+```
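+
+Run the server from a shell that has sourced `setupvars.sh` and exported `LD_LIBRARY_PATH` as above; otherwise the OpenVINO libraries will not be found at load time.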
+
+In another terminal, create and run the Ollama model:
+```bash
+./ollama create llama3.2-1b-f16 -f Modelfile
+./ollama run llama3.2-1b-f16
+```
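+
+Optionally, verify generation over the REST API (assuming the default port 11434):
+```bash
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.2-1b-f16",
+  "prompt": "Why is the sky blue?"
+}'
+```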
diff --git a/go.mod b/go.mod
index 46e7f433fd1..1ab8a1b1ae2 100644
--- a/go.mod
+++ b/go.mod
@@ -5,7 +5,6 @@ go 1.24.0
require (
github.com/containerd/console v1.0.3
github.com/gin-gonic/gin v1.10.0
- github.com/golang/protobuf v1.5.4 // indirect
github.com/google/uuid v1.6.0
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
@@ -30,46 +29,48 @@ require (
require (
github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
+ github.com/bytedance/sonic v1.11.6 // indirect
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/chewxy/hm v1.0.0 // indirect
github.com/chewxy/math32 v1.11.0 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
- github.com/gogo/protobuf v1.3.2 // indirect
- github.com/google/flatbuffers v24.3.25+incompatible // indirect
- github.com/kr/text v0.2.0 // indirect
- github.com/pkg/errors v0.9.1 // indirect
- github.com/pmezard/go-difflib v1.0.0 // indirect
- github.com/rivo/uniseg v0.2.0 // indirect
- github.com/xtgo/set v1.0.0 // indirect
- go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
- golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
- gorgonia.org/vecf32 v0.9.0 // indirect
- gorgonia.org/vecf64 v0.9.0 // indirect
-)
-
-require (
- github.com/bytedance/sonic v1.11.6 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
- github.com/gin-contrib/cors v1.7.2
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.20.0 // indirect
github.com/goccy/go-json v0.10.2 // indirect
+ github.com/gogo/protobuf v1.3.2 // indirect
+ github.com/golang/protobuf v1.5.4 // indirect
+ github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
+ github.com/kr/text v0.2.0 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
+ github.com/pkg/errors v0.9.1 // indirect
+ github.com/pmezard/go-difflib v1.0.0 // indirect
+ github.com/rivo/uniseg v0.2.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
+ github.com/xtgo/set v1.0.0 // indirect
+ go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/arch v0.8.0 // indirect
+ golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
+ gopkg.in/yaml.v3 v3.0.1 // indirect
+ gorgonia.org/vecf32 v0.9.0 // indirect
+ gorgonia.org/vecf64 v0.9.0 // indirect
+)
+
+require (
+ github.com/gin-contrib/cors v1.7.2
golang.org/x/crypto v0.36.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
golang.org/x/net v0.38.0 // indirect
@@ -77,5 +78,4 @@ require (
golang.org/x/term v0.30.0
golang.org/x/text v0.23.0
google.golang.org/protobuf v1.34.1
- gopkg.in/yaml.v3 v3.0.1 // indirect
)
diff --git a/llama/llama-openvino.cpp b/llama/llama-openvino.cpp
new file mode 100644
index 00000000000..26d643fc81b
--- /dev/null
+++ b/llama/llama-openvino.cpp
@@ -0,0 +1,11 @@
+#include "ggml.h"
+#include "ggml-openvino.h"
+
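+// Referencing a ggml-openvino symbol here forces the linker to keep the backend library.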
+extern "C" {
+ void force_link_openvino() {
+ struct ggml_backend* b = ggml_backend_openvino_init(0);
+ if (b) {
+ ggml_backend_free(b);
+ }
+ }
+}
diff --git a/llama/llama.cpp/include/llama.h b/llama/llama.cpp/include/llama.h
index a0a660bff88..ff1ce26517a 100644
--- a/llama/llama.cpp/include/llama.h
+++ b/llama/llama.cpp/include/llama.h
@@ -178,8 +178,8 @@ extern "C" {
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};
-
- enum llama_flash_attn_type {
+
+ enum llama_flash_attn_type {
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
@@ -314,6 +314,9 @@ extern "C" {
enum llama_attention_type attention_type; // attention type to use for embeddings
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
+        int backend_type;   // ggml backend to run this context on (e.g. GGML_BACKEND_OPENVINO)
+        int device_index;   // device index within the selected backend
+
// ref: https://github.com/ggml-org/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp
index 53a5e3a9bef..f3baa460e3b 100644
--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
@@ -7,10 +7,13 @@
#include "llama-mmap.h"
#include "llama-model.h"
+#include "ggml-openvino.h"
+
#include <cinttypes>
#include <cstring>
#include <limits>
#include <stdexcept>
+#include <iostream>
//
// llama_context
@@ -144,7 +147,7 @@ llama_context::llama_context(
}
backends.emplace_back(backend);
}
-
+
// add ACCEL backends (such as BLAS)
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -157,12 +160,37 @@ llama_context::llama_context(
}
}
+ // add OpenVINO backend if requested
+ if (params.backend_type == GGML_BACKEND_OPENVINO) {
+ std::cout << "In params openvino backend type" << std::endl;
+ //ggml_backend_t backend_ov = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_OPENVINO, nullptr);
+ ggml_backend_t backend_ov = ggml_backend_openvino_init(0);
+ std::cout << "After init by type" << std::endl;
+ if (backend_ov == nullptr) {
+ std::cout << "Backend is null ptr" << std::endl;
+ throw std::runtime_error("failed to initialize OpenVINO backend");
+ }
+ backends.emplace_back(backend_ov);
+ } else {
+ // default: CPU
+ std::cout << "Not in params openvino backend" << std::endl;
+ backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+ if (backend_cpu == nullptr) {
+ throw std::runtime_error("failed to initialize CPU backend");
+ }
+ backends.emplace_back(backend_cpu);
+ }
+
+ std::cout << "After openvino backend if else" << std::endl;
+
+ /*
// add CPU backend
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (backend_cpu == nullptr) {
throw std::runtime_error("failed to initialize CPU backend");
}
backends.emplace_back(backend_cpu);
+ */
// create a list of the set_n_threads functions in the backends
for (auto & backend : backends) {
@@ -2274,6 +2302,8 @@ llama_context_params llama_context_default_params() {
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
/*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
+ /*.backend_type =*/ GGML_BACKEND_DEVICE_TYPE_OPENVINO,
+ /*.device_index =*/ 0,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.yarn_ext_factor =*/ -1.0f,
diff --git a/llama/llama.cpp/src/llama-graph.cpp b/llama/llama.cpp/src/llama-graph.cpp
index a24853c63ad..96fef078f5f 100644
--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
@@ -1093,7 +1093,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
- //cb(inp->tokens, "inp_tokens", -1);
+ cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens;
@@ -1141,6 +1141,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
auto & cur = inp->pos;
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
+ cb(cur, "inp_pos", -1);
ggml_set_input(cur);
res->add_input(std::move(inp));
@@ -1176,6 +1177,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
auto & cur = inp->out_ids;
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
+ cb(cur, "inp_out_ids", -1);
ggml_set_input(cur);
res->add_input(std::move(inp));
@@ -1420,6 +1422,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ cb(inp->kq_mask, "KQ_mask", -1);
ggml_set_input(inp->kq_mask);
inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
@@ -1466,7 +1469,7 @@ ggml_tensor * llm_graph_context::build_attn(
}
if (wo_b) {
- //cb(cur, "kqv_wo", il);
+ cb(cur, "kqv_wo", il);
}
if (wo_b) {
@@ -1496,6 +1499,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+ ggml_set_name(inp->self_kq_mask, "KQ_mask");
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1622,7 +1626,7 @@ ggml_tensor * llm_graph_context::build_attn(
}
if (wo_b) {
- //cb(cur, "kqv_wo", il);
+ cb(cur, "kqv_wo", il);
}
if (wo_b) {
@@ -1677,7 +1681,7 @@ ggml_tensor * llm_graph_context::build_attn(
}
if (wo_b) {
- //cb(cur, "kqv_wo", il);
+ cb(cur, "kqv_wo", il);
}
if (wo_b) {
@@ -1704,6 +1708,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+ ggml_set_name(inp->self_kq_mask, "KQ_mask");
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1718,6 +1723,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+ ggml_set_name(inp->self_kq_mask_swa, "KQ_mask_swa");
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
diff --git a/llama/llama.cpp/src/llama.cpp b/llama/llama.cpp/src/llama.cpp
index d821a96a02a..27803dee5e3 100644
--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
@@ -197,6 +197,7 @@ static struct llama_model * llama_model_load_from_file_impl(
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_CPU:
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+ case GGML_BACKEND_DEVICE_TYPE_OPENVINO:
// skip CPU backends since they are handled separately
break;
diff --git a/llama/llama.go b/llama/llama.go
index c995b3ead29..07a15f0597d 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -10,6 +10,11 @@ package llama
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/tools/mtmd
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src
#cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include
+#cgo LDFLAGS: -lopenvino -L${SRCDIR}/../build/lib/ollama -lggml-openvino -lstdc++
+
+#ifdef __cplusplus
+extern "C" {
+#endif
#include <stdlib.h>
#include "ggml.h"
@@ -20,8 +25,26 @@ package llama
#include "sampling_ext.h"
+#ifdef __cplusplus
+}
+#endif
+
+// C++ only
+#ifdef __cplusplus
+#include "ggml-openvino.hpp"
+#endif
+
extern bool llamaProgressCallback(float progress, void *user_data);
extern void llamaLog(int level, char* text, void* user_data);
+
+void force_link_openvino();
+typedef int ggml_backend_type;
+
+#define GGML_BACKEND_CPU 0
+#define GGML_BACKEND_CUDA 1
+#define GGML_BACKEND_OPENCL 2
+#define GGML_BACKEND_OPENVINO 4
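+// must stay in sync with GGML_BACKEND_OPENVINO in ml/backend/ggml/ggml/include/ggml-backend.h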
+
*/
import "C"
@@ -60,6 +83,7 @@ func llamaLog(level C.int, text *C.char, _ unsafe.Pointer) {
func BackendInit() {
ggml.OnceLoad()
+ C.force_link_openvino()
C.llama_backend_init()
}
@@ -108,6 +132,9 @@ func GetModelArch(modelPath string) (string, error) {
type ContextParams struct {
c C.struct_llama_context_params
+ BackendType C.ggml_backend_type
+ DeviceIndex int
+
}
func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
@@ -126,7 +153,8 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
- return ContextParams{c: params}
+ return ContextParams{c: params, BackendType: C.GGML_BACKEND_OPENVINO, DeviceIndex: 0}
+
}
// kvCacheTypeFromStr converts a string cache type to the corresponding GGML type value
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 88078d77980..f424784f75d 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -488,6 +488,8 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
gpuLayers++
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
slog.Info("offloading output layer to ACCEL")
+ case C.GGML_BACKEND_DEVICE_TYPE_OPENVINO:
+ slog.Info("offloading output layer to OPENVINO")
}
slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(b.layers)+1))
diff --git a/ml/backend/ggml/ggml/.rsync-filter b/ml/backend/ggml/ggml/.rsync-filter
index 449ec9e5d0b..48a5a93566a 100644
--- a/ml/backend/ggml/ggml/.rsync-filter
+++ b/ml/backend/ggml/ggml/.rsync-filter
@@ -20,6 +20,7 @@ include /src/ggml-cuda/vendors/
include /src/ggml-cuda/template-instances/
include /src/ggml-hip/
include /src/ggml-metal/
+include /src/ggml-openvino/
include src/ggml-vulkan/
include src/ggml-vulkan/vulkan-shaders
include CMakeLists.txt
diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h
index 094fc3c82c1..351cd755043 100644
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -3,6 +3,8 @@
#include "ggml.h"
#include "ggml-alloc.h"
+#define GGML_BACKEND_OPENVINO 4
+
#ifdef GGML_BACKEND_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BACKEND_BUILD
@@ -135,7 +137,8 @@ extern "C" {
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
- GGML_BACKEND_DEVICE_TYPE_ACCEL
+ GGML_BACKEND_DEVICE_TYPE_ACCEL,
+ GGML_BACKEND_DEVICE_TYPE_OPENVINO
};
// functionality supported by the device
diff --git a/ml/backend/ggml/ggml/include/ggml-openvino.h b/ml/backend/ggml/ggml/include/ggml-openvino.h
new file mode 100644
index 00000000000..151c48d40d0
--- /dev/null
+++ b/ml/backend/ggml/ggml/include/ggml-openvino.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <array>
+#include <cstddef>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_OPENVINO_NAME "OPENVINO"
+#define GGML_OPENVINO_MAX_DEVICES 16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_split_buffer_type(const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU
+// and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(void);
+
+GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
+// GGML_BACKEND_API void ggml_backend_openvino_get_device_description(int device, char * description,
+// size_t description_size);
+// GGML_BACKEND_API void ggml_backend_openvino_get_device_memory(int device, size_t * free, size_t * total);
+
+// GGML_BACKEND_API bool ggml_backend_openvino_register_host_buffer(void * buffer, size_t size);
+// GGML_BACKEND_API void ggml_backend_openvino_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
+
+struct ggml_openvino_device_info {
+ int device_count;
+
+ struct openvino_device_info {
+ int cc; // compute capability
+ int nsm; // number of streaming multiprocessors
+ size_t smpb; // max. shared memory per block
+ size_t smpbo; // max. shared memory per block (with opt-in)
+ bool vmm; // virtual memory support
+ size_t vmm_granularity; // granularity of virtual memory
+ size_t total_vram;
+ };
+
+ openvino_device_info devices[GGML_OPENVINO_MAX_DEVICES] = {};
+
+    std::array<float, GGML_OPENVINO_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_openvino_device_info & ggml_openvino_info();
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt
index aefe43bdd51..ce0b3ddd882 100644
--- a/ml/backend/ggml/ggml/src/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/CMakeLists.txt
@@ -331,7 +331,7 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
- ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+ #ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
# Many of these features are optional so we build versions with popular
@@ -390,6 +390,8 @@ ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
+ggml_add_backend(OpenVINO)
+if (GGML_OPENVINO)
+    target_compile_definitions(ggml-openvino PRIVATE GGML_USE_OPENVINO)
+endif()
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
diff --git a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
index 3a855ab2ef0..6d44b362828 100644
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -69,6 +69,10 @@
#include "ggml-cann.h"
#endif
+#ifdef GGML_USE_OPENVINO
+#include "ggml-openvino.h"
+#endif
+
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
@@ -220,6 +224,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
+#ifdef GGML_USE_OPENVINO
+ register_backend(ggml_backend_openvino_reg());
+#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif
@@ -417,7 +424,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
}
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
- ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_OPENVINO);
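+    // NOTE: PoC shortcut — always selects the OpenVINO device, ignoring the requested type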
if (!dev) {
return nullptr;
}
@@ -617,6 +624,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
+ ggml_backend_load_best("openvino", silent, dir_path);
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
if (backend_path) {
diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp
index 0b757af5946..28d0db2e65f 100644
--- a/ml/backend/ggml/ggml/src/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
@@ -1653,7 +1653,7 @@ ggml_backend_sched_t ggml_backend_sched_new_ext(
bool alloc_buffers) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
- GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
+ //GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
diff --git a/ml/backend/ggml/ggml/src/ggml-openvino/.clang-format b/ml/backend/ggml/ggml/src/ggml-openvino/.clang-format
new file mode 100644
index 00000000000..63dc2c472a9
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-openvino/.clang-format
@@ -0,0 +1,143 @@
+---
+# Override root .clang-format
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+ReferenceAlignment: Left
+PointerAlignment: Left
+Cpp11BracedListStyle: true
+AccessModifierOffset: -4
+BinPackArguments: false
+BreakBeforeBraces: Attach
+IndentCaseBlocks: false
+IndentCaseLabels: false
+
+Language: Cpp
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: Left
+AlignConsecutiveBitFields: AcrossComments
+AlignConsecutiveMacros: AcrossComments
+# AlignConsecutiveShortCaseStatements: AcrossComments
+AlignEscapedNewlines: Left # LeftWithLastLine
+AlignOperands: Align
+AlignTrailingComments:
+ Kind: Always
+ OverEmptyLines: 1
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakBeforeMultilineStrings: true
+BinPackParameters: true
+BitFieldColonSpacing: Both
+# BreakAdjacentStringLiterals: true
+BreakAfterAttributes: Never
+BreakBeforeBinaryOperators: None
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: false
+# BreakBinaryOperations: Never
+BreakConstructorInitializers: AfterColon
+# BreakFunctionDefinitionParameters: false
+BreakInheritanceList: AfterComma
+BreakStringLiterals: true
+# BreakTemplateDeclarations: Yes
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+DerivePointerAlignment: false
+DisableFormat: false
+EmptyLineBeforeAccessModifier: Leave
+EmptyLineAfterAccessModifier: Never
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks: Regroup
+IncludeCategories:
+ - Regex: '^<.*\.h>'
+ Priority: 1
+ SortPriority: 0
+ - Regex: '^<.*'
+ Priority: 2
+ SortPriority: 0
+ - Regex: '.*'
+ Priority: 3
+ SortPriority: 0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentExternBlock: NoIndent
+IndentGotoLabels: false
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+InsertBraces: true # NOTE: may lead to incorrect formatting
+InsertNewlineAtEOF: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+LambdaBodyIndentation: Signature
+LineEnding: LF
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+PPIndentWidth: -1
+PackConstructorInitializers: CurrentLine
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+QualifierAlignment: Left
+#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
+RawStringFormats:
+ - Language: Cpp
+ Delimiters:
+ - cc
+ - CC
+ - cpp
+ - Cpp
+ - CPP
+ - 'c++'
+ - 'C++'
+ CanonicalDelimiter: ''
+ReflowComments: false # IndentOnly
+SeparateDefinitionBlocks: Always
+SortIncludes: CaseInsensitive
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: Never
+SpacesInLineCommentPrefix:
+ Minimum: 1
+ Maximum: -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard: c++17
+TabWidth: 4
+UseTab: Never
+WhitespaceSensitiveMacros: ['STRINGIZE']
+...
diff --git a/ml/backend/ggml/ggml/src/ggml-openvino/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-openvino/CMakeLists.txt
new file mode 100644
index 00000000000..54d693027ec
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-openvino/CMakeLists.txt
@@ -0,0 +1,19 @@
+find_package(OpenVINO REQUIRED)
+
+file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp")
+file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp")
+
+ggml_add_backend_library(ggml-openvino
+ ${GGML_SOURCES_OPENVINO}
+ ${GGML_HEADERS_OPENVINO}
+)
+
+target_link_libraries(ggml-openvino PRIVATE openvino::runtime tbb)
+
+if (GGML_OPENVINO)
+    if (NOT (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR
+             CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64|AMD64)$"))
+        message(FATAL_ERROR "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
+endif()
diff --git a/ml/backend/ggml/ggml/src/ggml-openvino/ggml-decoder.cpp b/ml/backend/ggml/ggml/src/ggml-openvino/ggml-decoder.cpp
new file mode 100644
index 00000000000..751fa192a42
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -0,0 +1,818 @@
+#include "ggml-decoder.h"
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include