6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -26,6 +26,7 @@ set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
set(GGML_CUDA_GRAPHS ON)
set(GGML_CUDA_FA ON)
set(GGML_CUDA_COMPRESSION_MODE default)
set(GGML_OPENVINO ON)

if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
@@ -71,6 +72,11 @@ install(TARGETS ggml-base ${CPU_VARIANTS}
FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU
)

install(TARGETS ggml-openvino
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT OPENVINO
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT OPENVINO
)

check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES)
1 change: 1 addition & 0 deletions Modelfile
@@ -0,0 +1 @@
FROM ./Llama-3.2-1B-Instruct.fp16.gguf
100 changes: 100 additions & 0 deletions docs/openvino.md
@@ -0,0 +1,100 @@
# OpenVINO Backend in Ollama

OpenVINO is a high-performance AI inference toolkit for optimizing inference on Intel CPUs, integrated and discrete GPUs, and NPUs. This branch contains the OpenVINO backend for Ollama: the backend converts the GGML compute graph to OpenVINO IR and accelerates inference on Intel AI PCs.

# Instructions to Build and Run the OpenVINO Backend

## Prerequisites

- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
- Git, CMake, and Ninja are required for building:
```bash
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
```

## Install OpenVINO Runtime

- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-archive-windows.html)

<details>
<summary>📦 Click to expand OpenVINO 2025.2 installation commands on Linux</summary>
<br>

```bash
export OPENVINO_VERSION_MAJOR=2025.2
export OPENVINO_VERSION_FULL=2025.2.0.19140.c01cd93e24d
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
sudo mkdir -p /opt/intel
wget -O openvino_${OPENVINO_VERSION_MAJOR}.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz
tar -xf openvino_${OPENVINO_VERSION_MAJOR}.tgz
sudo mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
rm openvino_${OPENVINO_VERSION_MAJOR}.tgz
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh && cd -
sudo ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
source /opt/intel/openvino/setupvars.sh
```
</details>

- Verify that OpenVINO is initialized properly:
```bash
echo $OpenVINO_DIR
```
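
If the variable prints nothing, the environment was not initialized in the current shell. A quick recovery, assuming the symlinked archive install from the step above:

```bash
# Re-initialize the OpenVINO environment and check again
source /opt/intel/openvino/setupvars.sh
echo $OpenVINO_DIR   # should now point inside /opt/intel/openvino
```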

## Build Ollama with the OpenVINO Backend

### Clone Ollama

Clone the OpenVINO-enabled Ollama fork:

```bash
git clone https://github.com/ynimmaga/ollama.git
cd ollama
git switch poc_openvino_backend
```
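
Before building, you can confirm that the expected branch is checked out:

```bash
git branch --show-current   # should print: poc_openvino_backend
```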

### Build the GGML OpenVINO Backend and Add It to the Library Path

```bash
mkdir build && cd build
cmake .. -DGGML_OPENVINO=ON -DBUILD_SHARED_LIBS=ON
make -j8
export LD_LIBRARY_PATH=$PWD/lib/ollama:$LD_LIBRARY_PATH
# INTEL_OPENVINO_DIR is set by OpenVINO's setupvars.sh
export CGO_LDFLAGS="-L$INTEL_OPENVINO_DIR/runtime/lib/intel64"
```
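
As a quick sanity check, the backend shared library should now be present on the path exported above (the exact file name is an assumption based on the `ggml-openvino` CMake target):

```bash
# Expect a shared library such as libggml-openvino.so (name assumed)
ls $PWD/lib/ollama | grep -i openvino
```
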
### Build Ollama

```bash
cd $ollama_root   # $ollama_root: path to the cloned ollama repository
go clean -cache
go mod tidy
go build .
```
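
To confirm that a working binary was produced (the linkage check below is a sketch; whether OpenVINO shows up in the dynamic dependencies depends on how the backend library was linked):

```bash
./ollama --version                        # the binary runs
ldd ./ollama | grep -i openvino || true   # linkage may vary by build
```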

## Download a Model for Testing

```bash
# Download model file: Llama-3.2-1B-Instruct.fp16.gguf
wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \
-O Llama-3.2-1B-Instruct.fp16.gguf
```

## Create a Modelfile

Create a file named `Modelfile` containing the following line:

```
FROM ./Llama-3.2-1B-Instruct.fp16.gguf
```
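
Equivalently, the Modelfile can be written from the shell, assuming the GGUF file is in the current directory:

```bash
cat > Modelfile <<'EOF'
FROM ./Llama-3.2-1B-Instruct.fp16.gguf
EOF
```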
## Start the Ollama Server and Run Inference

```bash
cd $ollama_root
./ollama serve
```

Open another terminal, then create and run the Ollama model:
```bash
./ollama create llama3.2-1b-f16 -f Modelfile
./ollama run llama3.2-1b-f16
```
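
Once the model is created, the server can also be exercised over Ollama's REST API (default port 11434):

```bash
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2-1b-f16",
  "prompt": "Why is the sky blue?",
  "stream": false
}'
```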
36 changes: 18 additions & 18 deletions go.mod
@@ -5,7 +5,6 @@ go 1.24.0
require (
github.com/containerd/console v1.0.3
github.com/gin-gonic/gin v1.10.0
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/uuid v1.6.0
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
@@ -30,52 +29,53 @@ require (

require (
github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
github.com/bytedance/sonic v1.11.6 // indirect
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/chewxy/hm v1.0.0 // indirect
github.com/chewxy/math32 v1.11.0 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/kr/text v0.2.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/xtgo/set v1.0.0 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gorgonia.org/vecf32 v0.9.0 // indirect
gorgonia.org/vecf64 v0.9.0 // indirect
)

require (
github.com/bytedance/sonic v1.11.6 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
github.com/gin-contrib/cors v1.7.2
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.20.0 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
github.com/xtgo/set v1.0.0 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
gorgonia.org/vecf32 v0.9.0 // indirect
gorgonia.org/vecf64 v0.9.0 // indirect
)

require (
github.com/gin-contrib/cors v1.7.2
golang.org/x/crypto v0.36.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/sys v0.31.0
golang.org/x/term v0.30.0
golang.org/x/text v0.23.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)
11 changes: 11 additions & 0 deletions llama/llama-openvino.cpp
@@ -0,0 +1,11 @@
#include "ggml.h"
#include "ggml-openvino.h"

extern "C" {
void force_link_openvino() {
struct ggml_backend* b = ggml_backend_openvino_init(0);
if (b) {
ggml_backend_free(b);
}
}
}
7 changes: 5 additions & 2 deletions llama/llama.cpp/include/llama.h
@@ -178,8 +178,8 @@ extern "C" {
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};

enum llama_flash_attn_type {
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
@@ -314,6 +314,9 @@ extern "C" {
enum llama_attention_type attention_type; // attention type to use for embeddings
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention

int backend_type;   // ggml backend device type to use (e.g. GGML_BACKEND_OPENVINO)
int device_index;   // index of the device within the selected backend

// ref: https://github.com/ggml-org/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
32 changes: 31 additions & 1 deletion llama/llama.cpp/src/llama-context.cpp
@@ -7,10 +7,13 @@
#include "llama-mmap.h"
#include "llama-model.h"

#include "ggml-openvino.h"

#include <cinttypes>
#include <cstring>
#include <limits>
#include <stdexcept>

//
// llama_context
@@ -144,7 +147,7 @@ llama_context::llama_context(
}
backends.emplace_back(backend);
}

// add ACCEL backends (such as BLAS)
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -157,12 +160,37 @@ llama_context::llama_context(
}
}

    // add the OpenVINO backend if requested
    if (params.backend_type == GGML_BACKEND_OPENVINO) {
        LLAMA_LOG_INFO("%s: initializing OpenVINO backend\n", __func__);
        // use the device index carried in the params (added together with backend_type)
        ggml_backend_t backend_ov = ggml_backend_openvino_init(params.device_index);
        if (backend_ov == nullptr) {
            throw std::runtime_error("failed to initialize OpenVINO backend");
        }
        backends.emplace_back(backend_ov);
    } else {
        // default: CPU backend
        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        if (backend_cpu == nullptr) {
            throw std::runtime_error("failed to initialize CPU backend");
        }
        backends.emplace_back(backend_cpu);
    }

// create a list of the set_n_threads functions in the backends
for (auto & backend : backends) {
@@ -2274,6 +2302,8 @@ llama_context_params llama_context_default_params() {
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
/*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
/*.backend_type =*/ GGML_BACKEND_DEVICE_TYPE_OPENVINO,
/*.device_index =*/ 0,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.yarn_ext_factor =*/ -1.0f,
14 changes: 10 additions & 4 deletions llama/llama.cpp/src/llama-graph.cpp
@@ -1093,7 +1093,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {

if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens;

@@ -1141,6 +1141,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
auto & cur = inp->pos;

cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
cb(cur, "inp_pos", -1);
ggml_set_input(cur);

res->add_input(std::move(inp));
@@ -1176,6 +1177,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
auto & cur = inp->out_ids;

cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
cb(cur, "inp_out_ids", -1);
ggml_set_input(cur);

res->add_input(std::move(inp));
@@ -1420,6 +1422,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con

// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
cb(inp->kq_mask, "KQ_mask", -1);
ggml_set_input(inp->kq_mask);

inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
@@ -1466,7 +1469,7 @@ ggml_tensor * llm_graph_context::build_attn(
}

if (wo_b) {
cb(cur, "kqv_wo", il);
}

if (wo_b) {
@@ -1496,6 +1499,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
ggml_set_name(inp->self_kq_mask, "KQ_mask");
ggml_set_input(inp->self_kq_mask);

inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1622,7 +1626,7 @@ ggml_tensor * llm_graph_context::build_attn(
}

if (wo_b) {
cb(cur, "kqv_wo", il);
}

if (wo_b) {
@@ -1677,7 +1681,7 @@ ggml_tensor * llm_graph_context::build_attn(
}

if (wo_b) {
cb(cur, "kqv_wo", il);
}

if (wo_b) {
@@ -1704,6 +1708,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);

inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
ggml_set_name(inp->self_kq_mask, "KQ_mask");
ggml_set_input(inp->self_kq_mask);

inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1718,6 +1723,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);

inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
ggml_set_name(inp->self_kq_mask_swa, "KQ_mask_swa");
ggml_set_input(inp->self_kq_mask_swa);

inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
1 change: 1 addition & 0 deletions llama/llama.cpp/src/llama.cpp
@@ -197,6 +197,7 @@ static struct llama_model * llama_model_load_from_file_impl(
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_CPU:
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
case GGML_BACKEND_DEVICE_TYPE_OPENVINO:
// skip CPU, ACCEL, and OpenVINO backends since they are handled separately
break;
