From b6795e041ac4fffae2eeddd42f63a12d3f3ef6f4 Mon Sep 17 00:00:00 2001
From: Jakub Gajski
Date: Thu, 7 Aug 2025 13:03:20 +0200
Subject: [PATCH] Add dedicated CPU-only Dockerfile and update documentation
 for CPU/GPU builds

---
 Dockerfile_llamacpp_cpuonly            | 95 ++++++++++++++++++++++++++
 backends/llamacpp/requirements-cpu.txt |  4 ++
 docs/source/backends/llamacpp.md       | 45 +++++++++---
 3 files changed, 134 insertions(+), 10 deletions(-)
 create mode 100644 Dockerfile_llamacpp_cpuonly
 create mode 100644 backends/llamacpp/requirements-cpu.txt

diff --git a/Dockerfile_llamacpp_cpuonly b/Dockerfile_llamacpp_cpuonly
new file mode 100644
index 00000000000..805b8905ff9
--- /dev/null
+++ b/Dockerfile_llamacpp_cpuonly
@@ -0,0 +1,95 @@
+FROM ubuntu:24.04 AS deps
+
+ARG llamacpp_version=b4827
+ARG llamacpp_native=ON
+ARG llamacpp_cpu_arm_arch=native
+
+WORKDIR /opt/src
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt update && apt upgrade -y && apt install -y \
+    clang \
+    cmake \
+    curl \
+    git \
+    python3-dev \
+    libssl-dev \
+    pkg-config \
+    tar \
+    libopenblas-dev \
+    libblas-dev \
+    liblapack-dev
+
+ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
+RUN mkdir -p llama.cpp \
+    && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
+    && cd llama.cpp \
+    && cmake -B build \
+        -DCMAKE_INSTALL_PREFIX=/usr \
+        -DCMAKE_INSTALL_LIBDIR=/usr/lib \
+        -DCMAKE_C_COMPILER=clang \
+        -DCMAKE_CXX_COMPILER=clang++ \
+        -DGGML_NATIVE=${llamacpp_native} \
+        -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
+        -DLLAMA_BUILD_COMMON=OFF \
+        -DLLAMA_BUILD_TESTS=OFF \
+        -DLLAMA_BUILD_EXAMPLES=OFF \
+        -DLLAMA_BUILD_SERVER=OFF \
+        -DGGML_BLAS=ON \
+        -DGGML_BLAS_VENDOR=OpenBLAS \
+        -DGGML_BACKEND_BLAS=ON \
+        -DBUILD_SHARED_LIBS=ON \
+    && cmake --build build --parallel --config Release \
+    && cmake --install build
+
+WORKDIR /app
+COPY rust-toolchain.toml rust-toolchain.toml
+RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
+ENV PATH="/root/.cargo/bin:$PATH"
+RUN cargo install cargo-chef --locked
+
+FROM deps AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM deps AS builder
+COPY --from=planner /app/recipe.json recipe.json
+RUN cargo chef cook \
+    --recipe-path recipe.json \
+    --profile release \
+    --package text-generation-router-llamacpp
+COPY . .
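+# Build only the TGI llama.cpp router binary here; its dependencies were
+# already compiled and cached by the `cargo chef cook` layer above.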
+RUN cargo build \
+    --profile release \
+    --package text-generation-router-llamacpp --frozen
+
+FROM ubuntu:24.04
+WORKDIR /app
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt update && apt upgrade -y && apt install -y \
+    python3-venv \
+    python3-pip \
+    libopenblas0 \
+    libblas3 \
+    liblapack3
+
+RUN python3 -m venv /venv
+ENV PATH="/venv/bin:$PATH"
+
+COPY backends/llamacpp/requirements-cpu.txt requirements.txt
+COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
+COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
+
+# Install torch from the CPU wheel index to avoid the automatic download of NVIDIA dependencies (leaner image)
+RUN pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
+    torch==2.8.0 \
+    && pip3 install --no-cache-dir -r requirements.txt -e gguf-py
+
+COPY --from=builder /usr/lib/libllama.so /usr/lib/
+COPY --from=builder /usr/lib/libggml*.so /usr/lib/
+COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
+
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+ENTRYPOINT ["text-generation-router-llamacpp"]
diff --git a/backends/llamacpp/requirements-cpu.txt b/backends/llamacpp/requirements-cpu.txt
new file mode 100644
index 00000000000..24a4e32a2d7
--- /dev/null
+++ b/backends/llamacpp/requirements-cpu.txt
@@ -0,0 +1,4 @@
+transformers[torch]==4.49
+huggingface-hub==0.28.1
+hf-transfer==0.1.9
+# When changing the transformers version, adjust the torch version in Dockerfile_llamacpp_cpuonly
\ No newline at end of file
diff --git a/docs/source/backends/llamacpp.md b/docs/source/backends/llamacpp.md
index 5cf0edf0c68..f3f8f2a2b78 100644
--- a/docs/source/backends/llamacpp.md
+++ b/docs/source/backends/llamacpp.md
@@ -12,7 +12,7 @@ environments.
 - Full compatibility with GGUF format and all quantization formats
   (GGUF-related constraints may be mitigated dynamically by on-the-fly
   generation in future updates)
-- Optimized inference on CPU and GPU architectures
+- Optimized inference on CPU and GPU architectures, each with a dedicated Dockerfile
 - Containerized deployment, eliminating dependency complexity
 - Seamless interoperability with the Hugging Face ecosystem
 
@@ -24,13 +24,24 @@ You will find the best models on [Hugging Face][GGUF].
 
 ## Build Docker image
 
+### For CPU-only inference
+
 For optimal performance, the Docker image is compiled with native CPU
 instructions by default. As a result, it is strongly recommended to run the
 container on the same host architecture used during the build process. Efforts
 are ongoing to improve portability across different systems while preserving
 high computational efficiency.
 
-To build the Docker image, use the following command:
+To build the Docker image for CPU, use the following command:
+
+```bash
+docker build \
+    -t tgi-llamacpp-cpu \
+    https://github.com/huggingface/text-generation-inference.git \
+    -f Dockerfile_llamacpp_cpuonly
+```
+
+### For GPU-enabled inference
+
+To build the Docker image for GPU, use the following command:
 
 ```bash
 docker build \
@@ -41,13 +52,13 @@ docker build \
 
 ### Build parameters
 
-| Parameter (with --build-arg)              | Description                      |
-| ----------------------------------------- | -------------------------------- |
-| `llamacpp_version=bXXXX`                  | Specific version of llama.cpp    |
-| `llamacpp_cuda=ON`                        | Enables CUDA acceleration        |
-| `llamacpp_native=OFF`                     | Disable automatic CPU detection  |
-| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...` | Specific ARM CPU and features    |
-| `cuda_arch=ARCH`                          | Defines target CUDA architecture |
+| Parameter (with --build-arg)              | Description                      | CPU or GPU? |
+| ----------------------------------------- | -------------------------------- | ----------- |
+| `llamacpp_version=bXXXX`                  | Specific version of llama.cpp    | Both        |
+| `llamacpp_cuda=ON`                        | Enables CUDA acceleration        | GPU         |
+| `llamacpp_native=OFF`                     | Disables automatic CPU detection | Both        |
+| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...` | Specific ARM CPU and features    | Both        |
+| `cuda_arch=ARCH`                          | Defines target CUDA architecture | GPU         |
 
 For example, to target Graviton4 when building on another ARM architecture:
 
@@ -61,6 +72,20 @@ docker build \
     -t tgi-llamacpp \
     --build-arg llamacpp_native=OFF \
     --build-arg llamacpp_cpu_arm_arch=armv9-a+i8mm \
     https://github.com/huggingface/text-generation-inference.git \
     -f Dockerfile_llamacpp
 ```
 
+For example, to build for the local CPU without GPU acceleration:
+
+```bash
+docker build \
+    -t tgi-llamacpp-cpu \
+    --build-arg llamacpp_native=ON \
+    https://github.com/huggingface/text-generation-inference.git \
+    -f Dockerfile_llamacpp_cpuonly
+```
+
+As a rule of thumb, if you do not need GPU acceleration, build the CPU-only
+image: it is significantly smaller
+(1.7 GB for `Dockerfile_llamacpp_cpuonly` vs 17 GB for `Dockerfile_llamacpp`).
+
 ## Run Docker image
 
 ### CPU-based inference
 
@@ -70,7 +95,7 @@ docker run \
     -p 3000:3000 \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
-    tgi-llamacpp \
+    tgi-llamacpp-cpu \
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```
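+
+Once the container is running, you can send it a request. A minimal smoke
+test, assuming the standard TGI OpenAI-compatible route on the mapped
+port 3000:
+
+```bash
+curl http://localhost:3000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "messages": [
+            {"role": "user", "content": "What is deep learning?"}
+        ],
+        "max_tokens": 128
+    }'
+```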