From b6795e041ac4fffae2eeddd42f63a12d3f3ef6f4 Mon Sep 17 00:00:00 2001
From: Jakub Gajski
Date: Thu, 7 Aug 2025 13:03:20 +0200
Subject: [PATCH] Add dedicated CPU-only Dockerfile and update documentation
 for CPU/GPU builds

---
 Dockerfile_llamacpp_cpuonly            | 95 ++++++++++++++++++++++++++
 backends/llamacpp/requirements-cpu.txt |  4 ++
 docs/source/backends/llamacpp.md       | 45 +++++++++---
 3 files changed, 134 insertions(+), 10 deletions(-)
 create mode 100644 Dockerfile_llamacpp_cpuonly
 create mode 100644 backends/llamacpp/requirements-cpu.txt

diff --git a/Dockerfile_llamacpp_cpuonly b/Dockerfile_llamacpp_cpuonly
new file mode 100644
index 00000000000..805b8905ff9
--- /dev/null
+++ b/Dockerfile_llamacpp_cpuonly
@@ -0,0 +1,95 @@
+FROM ubuntu:24.04 AS deps
+
+ARG llamacpp_version=b4827
+ARG llamacpp_native=ON
+ARG llamacpp_cpu_arm_arch=native
+
+WORKDIR /opt/src
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt update && apt upgrade -y && apt install -y \
+    clang \
+    cmake \
+    curl \
+    git \
+    python3-dev \
+    libssl-dev \
+    pkg-config \
+    tar \
+    libopenblas-dev \
+    libblas-dev \
+    liblapack-dev
+
+ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
+RUN mkdir -p llama.cpp \
+    && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
+    && cd llama.cpp \
+    && cmake -B build \
+        -DCMAKE_INSTALL_PREFIX=/usr \
+        -DCMAKE_INSTALL_LIBDIR=/usr/lib \
+        -DCMAKE_C_COMPILER=clang \
+        -DCMAKE_CXX_COMPILER=clang++ \
+        -DGGML_NATIVE=${llamacpp_native} \
+        -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
+        -DLLAMA_BUILD_COMMON=OFF \
+        -DLLAMA_BUILD_TESTS=OFF \
+        -DLLAMA_BUILD_EXAMPLES=OFF \
+        -DLLAMA_BUILD_SERVER=OFF \
+        -DGGML_BLAS=ON \
+        -DGGML_BLAS_VENDOR=OpenBLAS \
+        -DGGML_BACKEND_BLAS=ON \
+        -DBUILD_SHARED_LIBS=ON \
+    && cmake --build build --parallel --config Release \
+    && cmake --install build
+
+WORKDIR /app
+COPY rust-toolchain.toml rust-toolchain.toml
+RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
+ENV PATH="/root/.cargo/bin:$PATH"
+RUN cargo install cargo-chef --locked
+
+FROM deps AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM deps AS builder
+COPY --from=planner /app/recipe.json recipe.json
+RUN cargo chef cook \
+    --recipe-path recipe.json \
+    --profile release \
+    --package text-generation-router-llamacpp
+COPY . .
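+# Build only the TGI llama.cpp router binary here; its dependencies were
+# already compiled and cached by the `cargo chef cook` layer above.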
+RUN cargo build \
+    --profile release \
+    --package text-generation-router-llamacpp --frozen
+
+FROM ubuntu:24.04
+WORKDIR /app
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt update && apt upgrade -y && apt install -y \
+    python3-venv \
+    python3-pip \
+    libopenblas0 \
+    libblas3 \
+    liblapack3
+
+RUN python3 -m venv /venv
+ENV PATH="/venv/bin:$PATH"
+
+COPY backends/llamacpp/requirements-cpu.txt requirements.txt
+COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
+COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
+
+# Install torch from the CPU wheel index to avoid the automatic download of NVIDIA dependencies (leaner image)
+RUN pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
+    torch==2.8.0 \
+    && pip3 install --no-cache-dir -r requirements.txt -e gguf-py
+
+COPY --from=builder /usr/lib/libllama.so /usr/lib/
+COPY --from=builder /usr/lib/libggml*.so /usr/lib/
+COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
+
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+ENTRYPOINT ["text-generation-router-llamacpp"]
diff --git a/backends/llamacpp/requirements-cpu.txt b/backends/llamacpp/requirements-cpu.txt
new file mode 100644
index 00000000000..24a4e32a2d7
--- /dev/null
+++ b/backends/llamacpp/requirements-cpu.txt
@@ -0,0 +1,4 @@
+transformers[torch]==4.49
+huggingface-hub==0.28.1
+hf-transfer==0.1.9
+# When changing the transformers version, adjust the torch version in Dockerfile_llamacpp_cpuonly
\ No newline at end of file
diff --git a/docs/source/backends/llamacpp.md b/docs/source/backends/llamacpp.md
index 5cf0edf0c68..f3f8f2a2b78 100644
--- a/docs/source/backends/llamacpp.md
+++ b/docs/source/backends/llamacpp.md
@@ -12,7 +12,7 @@ environments.
 - Full compatibility with GGUF format and all quantization formats
   (GGUF-related constraints may be mitigated dynamically by on-the-fly
   generation in future updates)
-- Optimized inference on CPU and GPU architectures
+- Optimized inference on CPU and GPU architectures, each with a dedicated Dockerfile
 - Containerized deployment, eliminating dependency complexity
 - Seamless interoperability with the Hugging Face ecosystem
 
@@ -24,13 +24,24 @@ You will find the best models on [Hugging Face][GGUF].
 
 ## Build Docker image
 
+### For CPU-only inference
+
 For optimal performance, the Docker image is compiled with native CPU
 instructions by default. As a result, it is strongly recommended to run the
 container on the same host architecture used during the build process. Efforts
 are ongoing to improve portability across different systems while preserving
 high computational efficiency.
 
-To build the Docker image, use the following command:
+To build the Docker image for CPU, use the following command:
+
+```bash
+docker build \
+    -t tgi-llamacpp-cpu \
+    https://github.com/huggingface/text-generation-inference.git \
+    -f Dockerfile_llamacpp_cpuonly
+```
+
+### For GPU-enabled inference
+
+To build the Docker image for GPU, use the following command:
 
 ```bash
 docker build \
@@ -41,13 +52,13 @@ docker build \
 
 ### Build parameters
 
-| Parameter (with --build-arg)              | Description                      |
-| ----------------------------------------- | -------------------------------- |
-| `llamacpp_version=bXXXX`                  | Specific version of llama.cpp    |
-| `llamacpp_cuda=ON`                        | Enables CUDA acceleration        |
-| `llamacpp_native=OFF`                     | Disable automatic CPU detection  |
-| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...` | Specific ARM CPU and features    |
-| `cuda_arch=ARCH`                          | Defines target CUDA architecture |
+| Parameter (with --build-arg)              | Description                      | CPU or GPU? |
+| ----------------------------------------- | -------------------------------- | ----------- |
+| `llamacpp_version=bXXXX`                  | Specific version of llama.cpp    | Both        |
+| `llamacpp_cuda=ON`                        | Enables CUDA acceleration        | GPU         |
+| `llamacpp_native=OFF`                     | Disables automatic CPU detection | Both        |
+| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...` | Specific ARM CPU and features    | Both        |
+| `cuda_arch=ARCH`                          | Defines target CUDA architecture | GPU         |
 
 For example, to target Graviton4 when building on another ARM architecture:
 
@@ -61,6 +72,20 @@ docker build \
     -t tgi-llamacpp \
     --build-arg llamacpp_native=OFF \
     --build-arg llamacpp_cpu_arm_arch=armv9-a+i8mm \
     https://github.com/huggingface/text-generation-inference.git \
     -f Dockerfile_llamacpp
 ```
 
+For example, to build for the local CPU without GPU acceleration:
+
+```bash
+docker build \
+    -t tgi-llamacpp-cpu \
+    --build-arg llamacpp_native=ON \
+    https://github.com/huggingface/text-generation-inference.git \
+    -f Dockerfile_llamacpp_cpuonly
+```
+
+As a rule of thumb, if you do not need GPU acceleration, build the CPU-only
+image: it is significantly smaller
+(1.7 GB for `Dockerfile_llamacpp_cpuonly` vs 17 GB for `Dockerfile_llamacpp`).
+
 ## Run Docker image
 
 ### CPU-based inference
 
@@ -70,7 +95,7 @@ docker run \
     -p 3000:3000 \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
-    tgi-llamacpp \
+    tgi-llamacpp-cpu \
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```
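+
+Once the container is running, you can send it a request. A minimal smoke
+test, assuming the standard TGI OpenAI-compatible route on the mapped
+port 3000:
+
+```bash
+curl http://localhost:3000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "messages": [
+            {"role": "user", "content": "What is deep learning?"}
+        ],
+        "max_tokens": 128
+    }'
+```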