6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -26,6 +26,7 @@ set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
set(GGML_CUDA_GRAPHS ON)
set(GGML_CUDA_FA ON)
set(GGML_CUDA_COMPRESSION_MODE default)
set(GGML_OPENVINO ON)

if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
@@ -71,6 +72,11 @@ install(TARGETS ggml-base ${CPU_VARIANTS}
FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
)

install(TARGETS ggml-openvino
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT OPENVINO
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT OPENVINO
)

check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES)
1 change: 1 addition & 0 deletions Modelfile
@@ -0,0 +1 @@
FROM ./Llama-3.2-1B-Instruct.fp16.gguf
100 changes: 100 additions & 0 deletions docs/openvino.md
@@ -0,0 +1,100 @@
# OpenVINO Backend in Ollama

OpenVINO is a high-performance AI inference toolkit for optimizing inference on Intel CPUs, integrated and discrete GPUs, and NPUs. This branch contains the OpenVINO backend for Ollama: the backend converts the GGML compute graph to OpenVINO IR and accelerates inference on Intel AI PCs.

# Instructions to Build and Run the OpenVINO Backend

## Prerequisites

- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
- Git, CMake, and Ninja are required for building:
```bash
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
```

## Install OpenVINO Runtime

- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)

<details>
<summary>📦 Click to expand OpenVINO 2025.2 installation commands on Linux</summary>
<br>

```bash
export OPENVINO_VERSION_MAJOR=2025.2
export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
sudo mkdir -p /opt/intel
wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
source /opt/intel/openvino/setupvars.sh
```
</details>

- Verify that OpenVINO is initialized properly:
```bash
echo $OpenVINO_DIR
```
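
If the variable prints nothing, the environment was not initialized in the current shell. A quick recovery, assuming the symlinked archive install from the step above:

```bash
# Re-initialize the OpenVINO environment and check again
source /opt/intel/openvino/setupvars.sh
echo $OpenVINO_DIR   # should now point inside /opt/intel/openvino
```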

## Build Ollama with the OpenVINO Backend

### Clone Ollama

Clone the OpenVINO-enabled Ollama fork:

```bash
git clone https://github.com/ynimmaga/ollama.git
cd ollama
git switch poc_openvino_backend
```
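
Before building, you can confirm that the expected branch is checked out:

```bash
git branch --show-current   # should print: poc_openvino_backend
```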

### Build the GGML OpenVINO Backend and Add It to the Library Path

```bash
mkdir build && cd build
cmake .. -DGGML_OPENVINO=ON -DBUILD_SHARED_LIBS=ON
make -j8
export LD_LIBRARY_PATH=$PWD/lib/ollama:$LD_LIBRARY_PATH
# INTEL_OPENVINO_DIR is set by OpenVINO's setupvars.sh
export CGO_LDFLAGS="-L$INTEL_OPENVINO_DIR/runtime/lib/intel64"
```
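
As a quick sanity check, the backend shared library should now be present on the path exported above (the exact file name is an assumption based on the `ggml-openvino` CMake target):

```bash
# Expect a shared library such as libggml-openvino.so (name assumed)
ls $PWD/lib/ollama | grep -i openvino
```
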
### Build Ollama

```bash
cd $ollama_root   # $ollama_root: path to the cloned ollama repository
go clean -cache
go mod tidy
go build .
```
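
To confirm that a working binary was produced (the linkage check below is a sketch; whether OpenVINO shows up in the dynamic dependencies depends on how the backend library was linked):

```bash
./ollama --version                        # the binary runs
ldd ./ollama | grep -i openvino || true   # linkage may vary by build
```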

## Download a Model for Testing

```bash
# Download model file: Llama-3.2-1B-Instruct.fp16.gguf
wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \
-O Llama-3.2-1B-Instruct.fp16.gguf
```

## Create a Modelfile

Create a file named `Modelfile` containing the following line:

```
FROM ./Llama-3.2-1B-Instruct.fp16.gguf
```
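
Equivalently, the Modelfile can be written from the shell, assuming the GGUF file is in the current directory:

```bash
cat > Modelfile <<'EOF'
FROM ./Llama-3.2-1B-Instruct.fp16.gguf
EOF
```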
## Start the Ollama Server and Run Inference

```bash
cd $ollama_root
./ollama serve
```

Open another terminal, then create and run the Ollama model:
```bash
./ollama create llama3.2-1b-f16 -f Modelfile
./ollama run llama3.2-1b-f16
```
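
Once the model is created, the server can also be exercised over Ollama's REST API (default port 11434):

```bash
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2-1b-f16",
  "prompt": "Why is the sky blue?",
  "stream": false
}'
```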
36 changes: 18 additions & 18 deletions go.mod
@@ -5,7 +5,6 @@ go 1.24.0
require (
github.com/containerd/console v1.0.3
github.com/gin-gonic/gin v1.10.0
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/uuid v1.6.0
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
@@ -30,52 +29,53 @@ require (

require (
github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
github.com/bytedance/sonic v1.11.6 // indirect
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/chewxy/hm v1.0.0 // indirect
github.com/chewxy/math32 v1.11.0 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/kr/text v0.2.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/xtgo/set v1.0.0 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gorgonia.org/vecf32 v0.9.0 // indirect
gorgonia.org/vecf64 v0.9.0 // indirect
)

require (
github.com/bytedance/sonic v1.11.6 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
github.com/gin-contrib/cors v1.7.2
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.20.0 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
github.com/xtgo/set v1.0.0 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
gorgonia.org/vecf32 v0.9.0 // indirect
gorgonia.org/vecf64 v0.9.0 // indirect
)

require (
github.com/gin-contrib/cors v1.7.2
golang.org/x/crypto v0.36.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/sys v0.31.0
golang.org/x/term v0.30.0
golang.org/x/text v0.23.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)
11 changes: 11 additions & 0 deletions llama/llama-openvino.cpp
@@ -0,0 +1,11 @@
#include "ggml.h"
#include "ggml-openvino.h"

extern "C" {
void force_link_openvino() {
struct ggml_backend* b = ggml_backend_openvino_init(0);
if (b) {
ggml_backend_free(b);
}
}
}
7 changes: 5 additions & 2 deletions llama/llama.cpp/include/llama.h
@@ -178,8 +178,8 @@ extern "C" {
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};

enum llama_flash_attn_type {
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
@@ -314,6 +314,9 @@ extern "C" {
enum llama_attention_type attention_type; // attention type to use for embeddings
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention

int backend_type;   // ggml backend device type to use (e.g. GGML_BACKEND_OPENVINO)
int device_index;   // index of the device within the selected backend

// ref: https://github.com/ggml-org/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
32 changes: 31 additions & 1 deletion llama/llama.cpp/src/llama-context.cpp
@@ -7,10 +7,13 @@
#include "llama-mmap.h"
#include "llama-model.h"

#include "ggml-openvino.h"

#include <cinttypes>
#include <cstring>
#include <limits>
#include <stdexcept>

//
// llama_context
@@ -144,7 +147,7 @@ llama_context::llama_context(
}
backends.emplace_back(backend);
}

// add ACCEL backends (such as BLAS)
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -157,12 +160,37 @@ llama_context::llama_context(
}
}

    // add the OpenVINO backend if requested
    if (params.backend_type == GGML_BACKEND_OPENVINO) {
        LLAMA_LOG_INFO("%s: initializing OpenVINO backend\n", __func__);
        // use the device index carried in the params (added together with backend_type)
        ggml_backend_t backend_ov = ggml_backend_openvino_init(params.device_index);
        if (backend_ov == nullptr) {
            throw std::runtime_error("failed to initialize OpenVINO backend");
        }
        backends.emplace_back(backend_ov);
    } else {
        // default: CPU backend
        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        if (backend_cpu == nullptr) {
            throw std::runtime_error("failed to initialize CPU backend");
        }
        backends.emplace_back(backend_cpu);
    }

// create a list of the set_n_threads functions in the backends
for (auto & backend : backends) {
@@ -2274,6 +2302,8 @@ llama_context_params llama_context_default_params() {
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
/*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
/*.backend_type =*/ GGML_BACKEND_DEVICE_TYPE_OPENVINO,
/*.device_index =*/ 0,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.yarn_ext_factor =*/ -1.0f,
14 changes: 10 additions & 4 deletions llama/llama.cpp/src/llama-graph.cpp
@@ -1093,7 +1093,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {

if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens;

@@ -1141,6 +1141,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
auto & cur = inp->pos;

cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
cb(cur, "inp_pos", -1);
ggml_set_input(cur);

res->add_input(std::move(inp));
@@ -1176,6 +1177,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
auto & cur = inp->out_ids;

cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
cb(cur, "inp_out_ids", -1);
ggml_set_input(cur);

res->add_input(std::move(inp));
@@ -1420,6 +1422,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con

// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
cb(inp->kq_mask, "KQ_mask", -1);
ggml_set_input(inp->kq_mask);

inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
@@ -1466,7 +1469,7 @@ ggml_tensor * llm_graph_context::build_attn(
}

if (wo_b) {
cb(cur, "kqv_wo", il);
}

if (wo_b) {
@@ -1496,6 +1499,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
ggml_set_name(inp->self_kq_mask, "KQ_mask");
ggml_set_input(inp->self_kq_mask);

inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1622,7 +1626,7 @@ ggml_tensor * llm_graph_context::build_attn(
}

if (wo_b) {
cb(cur, "kqv_wo", il);
}

if (wo_b) {
@@ -1677,7 +1681,7 @@ ggml_tensor * llm_graph_context::build_attn(
}

if (wo_b) {
cb(cur, "kqv_wo", il);
}

if (wo_b) {
@@ -1704,6 +1708,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);

inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
ggml_set_name(inp->self_kq_mask, "KQ_mask");
ggml_set_input(inp->self_kq_mask);

inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1718,6 +1723,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);

inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
ggml_set_name(inp->self_kq_mask_swa, "KQ_mask_swa");
ggml_set_input(inp->self_kq_mask_swa);

inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
1 change: 1 addition & 0 deletions llama/llama.cpp/src/llama.cpp
@@ -197,6 +197,7 @@ static struct llama_model * llama_model_load_from_file_impl(
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_CPU:
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
case GGML_BACKEND_DEVICE_TYPE_OPENVINO:
// skip CPU, ACCEL, and OpenVINO backends since they are handled separately
break;
