diff --git a/.github/workflows/ci-pr-checks.yaml b/.github/workflows/ci-pr-checks.yaml
index 12f9d82..ef07f3f 100644
--- a/.github/workflows/ci-pr-checks.yaml
+++ b/.github/workflows/ci-pr-checks.yaml
@@ -25,7 +25,9 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install -y pkg-config python3-dev python3-pip
-        make install-dependencies
+        make download-tokenizer
+        make download-zmq
+        make install-python-deps
         pip3 install transformers --break-system-packages
     - name: Run lint checks
diff --git a/.gitignore b/.gitignore
index d684889..8f8339b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ pkg/dataset/.llm-d
 pkg/llm-d-inference-sim/tests-tmp/
 pkg/llm-d-inference-sim/.llm-d/
 .llm-d/
+.venv
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 6b3625d..aec21ec 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,8 @@ IMAGE_REGISTRY ?= ghcr.io/llm-d
 IMAGE_TAG_BASE ?= $(IMAGE_REGISTRY)/$(PROJECT_NAME)
 SIM_TAG ?= dev
 IMG = $(IMAGE_TAG_BASE):$(SIM_TAG)
+POD_IP ?= pod
+export POD_IP
 
 ifeq ($(TARGETOS),darwin)
 	ifeq ($(TARGETARCH),amd64)
@@ -60,20 +62,61 @@ export PKG_CONFIG_PATH=/usr/lib/pkgconfig
 PYTHON_VERSION := 3.12
 
 # Unified Python configuration detection. This block runs once.
-PYTHON_CONFIG ?= $(shell command -v python$(PYTHON_VERSION)-config || command -v python3-config)
+# It prioritizes python-config, then pkg-config, for reliability.
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Darwin)
+  # macOS: Find Homebrew's python-config script for the most reliable flags.
+  BREW_PREFIX := $(shell command -v brew >/dev/null 2>&1 && brew --prefix python@$(PYTHON_VERSION) 2>/dev/null)
+  PYTHON_CONFIG := $(BREW_PREFIX)/bin/python$(PYTHON_VERSION)-config
+  ifneq ($(shell $(PYTHON_CONFIG) --cflags 2>/dev/null),)
+    PYTHON_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags)
+    # Use --ldflags --embed to get all necessary flags for linking
+    PYTHON_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+    PYTHON_LIBS :=
+  else
+    $(error "Could not execute 'python$(PYTHON_VERSION)-config' from Homebrew. Please ensure Python is installed correctly with: 'brew install python@$(PYTHON_VERSION)'")
+  endif
+else ifeq ($(UNAME_S),Linux)
+  # Linux: Use standard system tools to find flags.
+  PYTHON_CONFIG := $(shell command -v python$(PYTHON_VERSION)-config || command -v python3-config)
+  ifneq ($(shell $(PYTHON_CONFIG) --cflags 2>/dev/null),)
+    # Use python-config if available and correct
+    PYTHON_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags)
+    PYTHON_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+    PYTHON_LIBS :=
+  else ifneq ($(shell pkg-config --cflags python-$(PYTHON_VERSION) 2>/dev/null),)
+    # Fall back to pkg-config
+    PYTHON_CFLAGS := $(shell pkg-config --cflags python-$(PYTHON_VERSION))
+    PYTHON_LDFLAGS := $(shell pkg-config --libs python-$(PYTHON_VERSION))
+    PYTHON_LIBS :=
+  else
+    $(error "Python $(PYTHON_VERSION) development headers not found. Please install with: 'sudo apt install python$(PYTHON_VERSION)-dev' or 'sudo dnf install python$(PYTHON_VERSION)-devel'")
+  endif
+else
+  $(error "Unsupported OS: $(UNAME_S)")
+endif
+
+# Final CGO flags with all dependencies
+CGO_CFLAGS_FINAL := $(PYTHON_CFLAGS) -Ilib
+CGO_LDFLAGS_FINAL := $(PYTHON_LDFLAGS) $(PYTHON_LIBS) -Llib -ltokenizers -ldl -lm
+
+VENV_DIR ?= $(shell pwd)/.venv
+VENV_BIN := $(VENV_DIR)/bin
+VENV_SRC := $(VENV_DIR)/python
 
-CGO_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags --embed)
-CGO_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+PYTHON_EXE := $(shell command -v python$(PYTHON_VERSION) || command -v python3)
 
 GOMODCACHE := $(shell go env GOMODCACHE)
 KV_CACHE_MGR_VERSION := $(shell go list -m -f '{{.Version}}' github.com/llm-d/llm-d-kv-cache-manager)
 KV_CACHE_MGR_PATH := $(GOMODCACHE)/github.com/llm-d/llm-d-kv-cache-manager@$(KV_CACHE_MGR_VERSION)/pkg/preprocessing/chat_completions
-export PYTHONPATH := $(KV_CACHE_MGR_PATH):$(PYTHONPATH)
-CPATH := $(PYTHON_INCLUDE):$(CPATH)
+# Common environment variables for Go tests and builds
+export CGO_ENABLED=1
+export CGO_CFLAGS=$(CGO_CFLAGS_FINAL)
+export CGO_LDFLAGS=$(CGO_LDFLAGS_FINAL)
+export PYTHONPATH=$(VENV_SRC):$(VENV_DIR)/lib/python$(PYTHON_VERSION)/site-packages
 
 GO_LDFLAGS := -extldflags '-L$(shell pwd)/lib $(LDFLAGS) $(CGO_LDFLAGS)'
-CGO_ENABLED=1
 TOKENIZER_LIB = lib/libtokenizers.a
 # Extract TOKENIZER_VERSION from Dockerfile
 TOKENIZER_VERSION := $(shell grep '^ARG TOKENIZER_VERSION=' Dockerfile | cut -d'=' -f2)
@@ -84,7 +127,11 @@ $(TOKENIZER_LIB): ## Download the HuggingFace tokenizer bindings.
 	@echo "Downloading HuggingFace tokenizer bindings for version $(TOKENIZER_VERSION)..."
 	mkdir -p lib
-	curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-$(TOKENIZER_ARCH).tar.gz | tar -xz -C lib
+	if [ "$(TARGETOS)" = "darwin" ] && [ "$(TARGETARCH)" = "amd64" ]; then \
+		curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-x86_64.tar.gz | tar -xz -C lib; \
+	else \
+		curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib; \
+	fi
 	ranlib lib/*.a
 
 ##@ Development
@@ -101,13 +148,18 @@ format: ## Format Go source files
 	@gofmt -l -w $(SRC)
 
 .PHONY: test
-test: $(GINKGO) install-dependencies ## Run tests
-	@printf "\033[33;1m==== Running tests ====\033[0m\n"
-ifdef GINKGO_FOCUS
-	CGO_ENABLED=1 CGO_CFLAGS="$(CGO_CFLAGS)" $(GINKGO) -ldflags="$(GO_LDFLAGS)" -v -r -- -ginkgo.v -ginkgo.focus="$(GINKGO_FOCUS)"
-else
-	CGO_ENABLED=1 CGO_CFLAGS="$(CGO_CFLAGS)" $(GINKGO) -ldflags="$(GO_LDFLAGS)" -v -r $(TEST_PKG)
-endif
+test: download-tokenizer install-python-deps ## Run unit tests
+	@printf "\033[33;1m==== Running unit tests ====\033[0m\n"
+	if [ -n "$(GINKGO_FOCUS)" ] && [ -z "$(GINKGO_FOCUS_PKG)" ]; then \
+		echo "Error: GINKGO_FOCUS is defined without GINKGO_FOCUS_PKG. Both are required, or neither."; \
+		exit 1; \
+	elif [ -n "$(GINKGO_FOCUS)$(GINKGO_FOCUS_PKG)" ]; then \
+		echo "Running specific tests"; \
+		go test -v $(GINKGO_FOCUS_PKG) $(if $(GINKGO_FOCUS),-ginkgo.focus="$(GINKGO_FOCUS)",); \
+	else \
+		echo "Running all tests"; \
+		go test -v ./pkg/...; \
+	fi
 
 .PHONY: post-deploy-test
 post-deploy-test: ## Run post deployment tests
@@ -122,10 +174,15 @@ lint: $(GOLANGCI_LINT) ## Run lint
 
 ##@ Build
 
 .PHONY: build
-build: check-go install-dependencies
+build: check-go download-tokenizer install-python-deps download-zmq
 	@printf "\033[33;1m==== Building ====\033[0m\n"
 	CGO_CFLAGS="$(CGO_CFLAGS)" go build -ldflags="$(GO_LDFLAGS)" -o $(LOCALBIN)/$(PROJECT_NAME) cmd/$(PROJECT_NAME)/main.go
 
+.PHONY: run
+run: install-python-deps # build ## Run the application locally
+	@printf "\033[33;1m==== Running application ====\033[0m\n"
+	. $(VENV_DIR)/bin/activate && ./bin/$(PROJECT_NAME) $(ARGS)
+
 ##@ Container Build/Push
 
 .PHONY: image-build
@@ -234,43 +291,81 @@ print-project-name: ## Print the current project name
 
 install-hooks: ## Install git hooks
 	git config core.hooksPath hooks
 
+.PHONY: detect-python
+detect-python: ## Detects Python and prints the configuration.
+	@printf "\033[33;1m==== Python Configuration ====\033[0m\n"
+	@if [ -z "$(PYTHON_EXE)" ]; then \
+		echo "ERROR: Python 3 not found in PATH."; \
+		exit 1; \
+	fi
+	@# Verify the version of the found python executable using its exit code
+	@if ! $(PYTHON_EXE) -c "import sys; sys.exit(0 if sys.version_info[:2] == ($(shell echo $(PYTHON_VERSION) | cut -d. -f1), $(shell echo $(PYTHON_VERSION) | cut -d. -f2)) else 1)"; then \
+		echo "ERROR: Found Python at '$(PYTHON_EXE)' but it is not version $(PYTHON_VERSION)."; \
+		echo "Please ensure 'python$(PYTHON_VERSION)' or a compatible 'python3' is in your PATH."; \
+		exit 1; \
+	fi
+	@echo "Python executable: $(PYTHON_EXE) ($$($(PYTHON_EXE) --version))"
+	@echo "Python CFLAGS: $(PYTHON_CFLAGS)"
+	@echo "Python LDFLAGS: $(PYTHON_LDFLAGS)"
+	@if [ -z "$(PYTHON_CFLAGS)" ]; then \
+		echo "ERROR: Python development headers not found. See installation instructions above."; \
+		exit 1; \
+	fi
+	@printf "\033[33;1m==============================\033[0m\n"
+
+.PHONY: install-python-deps
+install-python-deps: detect-python ## Sets up the Python virtual environment and installs dependencies.
+	@printf "\033[33;1m==== Setting up Python virtual environment in $(VENV_DIR) ====\033[0m\n"
+	@if [ ! -f "$(VENV_BIN)/pip" ]; then \
+		echo "Creating virtual environment..."; \
+		$(PYTHON_EXE) -m venv $(VENV_DIR) || { \
+			echo "ERROR: Failed to create virtual environment."; \
+			echo "Your Python installation may be missing the 'venv' module."; \
+			echo "Try: 'sudo apt install python$(PYTHON_VERSION)-venv' or 'sudo dnf install python$(PYTHON_VERSION)-devel'"; \
+			exit 1; \
+		}; \
+		mkdir -p $(VENV_SRC); \
+	fi
+	@echo "Upgrading pip and installing dependencies..."
+	@$(VENV_BIN)/pip install --upgrade pip
+	cp $(KV_CACHE_MGR_PATH)/requirements.txt $(VENV_SRC)/
+	cp $(KV_CACHE_MGR_PATH)/render_jinja_template_wrapper.py $(VENV_SRC)/
+	chmod u+w $(VENV_SRC)/*
+	@$(VENV_BIN)/pip install -r $(VENV_SRC)/requirements.txt
+	@echo "Verifying transformers installation..."
+	@$(VENV_BIN)/python -c "import transformers; print('✅ Transformers version ' + transformers.__version__ + ' installed.')" || { \
+		echo "ERROR: transformers library not properly installed in venv."; \
+		exit 1; \
+	}
+
 ##@ ZMQ Setup
 
-.PHONY: install-dependencies
-install-dependencies: download-tokenizer ## Install development dependencies based on OS/ARCH
-	@echo "Checking and installing development dependencies..."
-	@if [ "$(TARGETOS)" = "linux" ]; then \
-		if [ -x "$$(command -v apt)" ]; then \
-			if ! dpkg -s libzmq3-dev >/dev/null 2>&1 || ! dpkg -s g++ >/dev/null 2>&1; then \
-				echo "Installing dependencies with apt..."; \
-				sudo apt-get update && sudo apt-get install -y libzmq3-dev g++; \
-			else \
-				echo "✅ ZMQ and g++ are already installed."; \
-			fi; \
-		elif [ -x "$$(command -v dnf)" ]; then \
-			if ! dnf -q list installed zeromq-devel >/dev/null 2>&1 || ! dnf -q list installed gcc-c++ >/dev/null 2>&1; then \
-				echo "Installing dependencies with dnf..."; \
-				sudo dnf install -y zeromq-devel gcc-c++; \
+.PHONY: download-zmq
+download-zmq: ## Install ZMQ dependencies based on OS/ARCH
+	@echo "Checking if ZMQ is already installed..."
+	@if pkg-config --exists libzmq; then \
+		echo "✅ ZMQ is already installed."; \
+	else \
+		echo "Installing ZMQ dependencies..."; \
+		if [ "$(TARGETOS)" = "linux" ]; then \
+			if [ -x "$$(command -v apt)" ]; then \
+				apt update && apt install -y libzmq3-dev; \
+			elif [ -x "$$(command -v dnf)" ]; then \
+				dnf install -y zeromq-devel; \
 			else \
-				echo "✅ ZMQ and gcc-c++ are already installed."; \
+				echo "Unsupported Linux package manager. Install libzmq manually."; \
+				exit 1; \
 			fi; \
-		else \
-			echo "Unsupported Linux package manager. Install libzmq and g++/gcc-c++ manually."; \
-			exit 1; \
-		fi; \
-	elif [ "$(TARGETOS)" = "darwin" ]; then \
-		if [ -x "$$(command -v brew)" ]; then \
-			if ! brew list zeromq pkg-config >/dev/null 2>&1; then \
-				echo "Installing dependencies with brew..."; \
-				brew install zeromq pkg-config; \
+		elif [ "$(TARGETOS)" = "darwin" ]; then \
+			if [ -x "$$(command -v brew)" ]; then \
+				brew install zeromq; \
 			else \
-				echo "✅ ZeroMQ and pkgconf are already installed."; \
+				echo "Homebrew is not installed and is required to install zeromq. Install it from https://brew.sh/"; \
+				exit 1; \
 			fi; \
 		else \
-			echo "Homebrew is not installed and is required to install zeromq. Install it from https://brew.sh/"; \
+			echo "Unsupported OS: $(TARGETOS). Install libzmq manually - check https://zeromq.org/download/ for guidance."; \
 			exit 1; \
 		fi; \
-	else \
-		echo "Unsupported OS: $(TARGETOS). Install development dependencies manually."; \
-		exit 1; \
+		echo "✅ ZMQ dependencies installed."; \
 	fi
diff --git a/README.md b/README.md
index 610eefd..a837943 100644
--- a/README.md
+++ b/README.md
@@ -332,7 +332,7 @@ ### Running
 To run the vLLM simulator in a standalone test environment, run:
 ```bash
-./bin/llm-d-inference-sim --model my_model --port 8000
+make run ARGS='--model="Qwen/Qwen2.5-1.5B-Instruct" --port 8000 -v=4'
 ```
 
 ## Kubernetes testing
diff --git a/pkg/kv-cache/kv_cache.go b/pkg/kv-cache/kv_cache.go
index a87c7b7..ae6da91 100644
--- a/pkg/kv-cache/kv_cache.go
+++ b/pkg/kv-cache/kv_cache.go
@@ -23,7 +23,6 @@ import (
 	"github.com/go-logr/logr"
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
 	"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
-	openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
 )
@@ -49,6 +48,7 @@
 	if err != nil {
 		return nil, fmt.Errorf("failed to create block cache: %w", err)
 	}
+
 	return &KVCacheHelper{
 		tokenizer:       tokenizer,
 		tokensProcessor: tokensProcessor,
@@ -71,18 +71,16 @@
 func (h *KVCacheHelper) Activate() {
 	h.blockCache.activate()
 }
 
-func (h *KVCacheHelper) OnRequestStart(vllmReq openaiserverapi.CompletionRequest) error {
+// OnRequestStart is called when a request is received. It simulates KV-cache block
+// management and returns the number of prompt tokens found in the cache.
+func (h *KVCacheHelper) OnRequestStart(prompt, modelName, requestID string) (int, error) {
 	h.logger.V(logging.TRACE).Info("KV cache - process request")
 
-	prompt := vllmReq.GetPrompt()
-	modelName := vllmReq.GetModel()
-	requestID := vllmReq.GetRequestID()
-
 	// tokenize the input
 	tokens, _, err := h.tokenizer.Encode(prompt, modelName)
 	if err != nil {
 		h.logger.Error(err, "prompt tokenization failed")
-		return err
+		return 0, err
 	}
 
 	// get block keys
@@ -95,8 +93,7 @@
 	}
 
 	nBlocksAlreadyInCache, err := h.blockCache.startRequest(requestID, blockHashes)
-	vllmReq.SetNumberOfCachedPromptTokens(nBlocksAlreadyInCache * h.blockSize)
-	return err
+	return nBlocksAlreadyInCache * h.blockSize, err
 }
 
 func (h *KVCacheHelper) OnRequestEnd(requestID string) error {
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index f097853..ab650b3 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -39,6 +39,7 @@ import (
 	kvcache "github.com/llm-d/llm-d-inference-sim/pkg/kv-cache"
 	openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
 	vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api"
+	preprocessing "github.com/llm-d/llm-d-kv-cache-manager/pkg/preprocessing/chat_completions"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
 )
@@ -53,6 +54,7 @@
 	requestIDHeader = "X-Request-Id"
 	podNameEnv      = "POD_NAME"
 	podNsEnv        = "POD_NAMESPACE"
+	envHFToken      = "HF_TOKEN"
 )
 
 type loraUsageState int
@@ -212,6 +214,11 @@
 	queueCapacity int
 	// a channel for incoming requests
 	newRequests chan *openaiserverapi.CompletionReqCtx
+
+	// chat template for converting /chat/completions request content to a plain string
+	chatTemplate string
+	// parameters for the chat template
+	chatTemplateKWArgs map[string]interface{}
 }
 
 // New creates a new VllmSimulator instance with the given logger
@@ -355,6 +362,24 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error {
 	}
 
 	if s.config.EnableKVCache {
+		// initialize the chat template; this is needed only when the KV cache is enabled
+		templateReq := preprocessing.FetchChatTemplateRequest{
+			Model: s.config.Model,
+			Token: os.Getenv(envHFToken),
+		}
+
+		chatTemplatingProcessor := preprocessing.NewChatTemplatingProcessor()
+		if err := chatTemplatingProcessor.Initialize(); err != nil {
+			return fmt.Errorf("failed to initialize chat-templating processor: %w", err)
+		}
+
+		s.chatTemplate, s.chatTemplateKWArgs, err = chatTemplatingProcessor.FetchChatTemplate(ctx, templateReq)
+		if err != nil {
+			s.logger.Error(err, "failed to get chat template")
+			return err
+		}
+		s.logger.V(logging.DEBUG).Info("Chat template loaded", "template", s.chatTemplate, "params", s.chatTemplateKWArgs)
+
 		s.kvcacheHelper, err = kvcache.NewKVCacheHelper(s.config, s.logger, s.metrics.kvCacheUsageChan, s.tokenizer)
 		if err != nil {
 			return err
@@ -765,3 +790,37 @@
 
 	return nil
 }
+
+func (s *VllmSimulator) getPromptForKVCache(reqCtx *openaiserverapi.CompletionReqCtx) (string, error) {
+	if reqCtx.IsChatCompletion {
+		renderReq := preprocessing.RenderJinjaTemplateRequest{
+			Conversations:             make([]preprocessing.ChatMessage, 0),
+			Tools:                     make([]interface{}, 0),
+			Documents:                 make([]interface{}, 0),
+			ReturnAssistantTokensMask: false,
+			ContinueFinalMessage:      false,
+			AddGenerationPrompt:       false,
+			ChatTemplate:              s.chatTemplate,
+			ChatTemplateKWArgs:        s.chatTemplateKWArgs,
+		}
+		// Convert messages to the format expected by the renderer
+		for _, msg := range reqCtx.CompletionReq.GetMessages() {
+			renderReq.Conversations = append(renderReq.Conversations, preprocessing.ChatMessage{
+				Role:    msg.Role,
+				Content: msg.Content.Raw,
+			})
+		}
+
+		// Don't use vllmReq.GetModel() - it may include a LoRA adapter name.
+		// This call requires the base model name instead.
+		prompt, err := s.tokenizer.RenderChatTemplate(s.config.Model, &renderReq)
+		if err != nil {
+			s.logger.Error(err, "failed to render template")
+			return "", err
+		}
+		s.logger.Info("Converted prompt", "rendered", prompt)
+		return prompt, nil
+	}
+
+	return reqCtx.CompletionReq.GetPrompt(), nil
+}
diff --git a/pkg/llm-d-inference-sim/worker.go b/pkg/llm-d-inference-sim/worker.go
index 481d185..5a248db 100644
--- a/pkg/llm-d-inference-sim/worker.go
+++ b/pkg/llm-d-inference-sim/worker.go
@@ -89,12 +89,23 @@
 			"metrics.lorasChan")
 	}
 
-	if s.config.EnableKVCache && !reqCtx.IsChatCompletion {
-		// kv cache is currently supported for /completion API only
-		if err := s.kvcacheHelper.OnRequestStart(req); err != nil {
+	if s.config.EnableKVCache {
+		prompt, err := s.getPromptForKVCache(reqCtx)
+		if err != nil {
+			s.logger.Error(err, "failed to render template")
+			s.sendCompletionError(reqCtx.HTTPReqCtx,
+				openaiserverapi.NewCompletionError(err.Error(), fasthttp.StatusInternalServerError, nil),
+				false)
+			return
+		}
+
+		if numOfCachedTokens, err := s.kvcacheHelper.OnRequestStart(prompt, req.GetModel(), req.GetRequestID()); err != nil {
 			s.sendCompletionError(reqCtx.HTTPReqCtx,
 				openaiserverapi.NewCompletionError(err.Error(), fasthttp.StatusInternalServerError, nil),
 				false)
+		} else {
+			req.SetNumberOfCachedPromptTokens(numOfCachedTokens)
+		}
 	}
diff --git a/pkg/openai-server-api/request.go b/pkg/openai-server-api/request.go
index 80bc4cf..ed06556 100644
--- a/pkg/openai-server-api/request.go
+++ b/pkg/openai-server-api/request.go
@@ -50,6 +50,8 @@
 	SetNumberOfCachedPromptTokens(cachedPromptTokens int)
 	// GetPrompt returns the prompt
 	GetPrompt() string
+	// GetMessages returns request messages (in chat completion)
+	GetMessages() []Message
 	// GetTools returns tools to use (in chat completion)
 	GetTools() []Tool
 	// GetToolChoice returns tool choice (in chat completion)
@@ -243,6 +245,10 @@
 	return messages
 }
 
+func (c *ChatCompletionRequest) GetMessages() []Message {
+	return c.Messages
+}
+
 func (c *ChatCompletionRequest) GetTools() []Tool {
 	return c.Tools
 }
@@ -335,6 +341,10 @@
 	return t.Prompt
 }
 
+func (c *TextCompletionRequest) GetMessages() []Message {
+	return make([]Message, 0)
+}
+
 func (c *TextCompletionRequest) GetTools() []Tool {
 	return nil
 }
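
---

Reviewer's note: to make the reworked `OnRequestStart` contract easy to see in isolation, below is a minimal, self-contained Go sketch. The toy cache, `toyHelper`, `blockSize`, and the fake tokenization are all hypothetical stand-ins for the real `pkg/kv-cache` block cache; only the shape of the call — `(prompt, modelName, requestID) -> (cachedTokens, error)`, with the caller applying `SetNumberOfCachedPromptTokens` — mirrors this PR.

```go
// Hypothetical sketch of the decoupled OnRequestStart contract.
// Not the real pkg/kv-cache implementation.
package main

import "fmt"

const blockSize = 16 // tokens per KV block (hypothetical value)

// toyHelper stands in for KVCacheHelper: it remembers how many blocks each
// model has seen, so a repeated prompt reports a warm cache.
type toyHelper struct {
	seenBlocks map[string]int
}

// OnRequestStart mimics the new API: the caller passes an already-rendered
// prompt string and receives the number of cached prompt tokens, instead of
// the helper reaching into the request object and mutating it.
func (h *toyHelper) OnRequestStart(prompt, modelName, requestID string) (int, error) {
	nBlocks := len([]rune(prompt)) / 4 // fake tokenization: ~4 chars per block
	cached := h.seenBlocks[modelName]
	if cached > nBlocks {
		cached = nBlocks
	}
	h.seenBlocks[modelName] = nBlocks // remember blocks for the next request
	return cached * blockSize, nil
}

func main() {
	h := &toyHelper{seenBlocks: map[string]int{}}
	for i, prompt := range []string{"Hello KV cache, first pass.", "Hello KV cache, first pass."} {
		cachedTokens, err := h.OnRequestStart(prompt, "Qwen/Qwen2.5-1.5B-Instruct", fmt.Sprintf("req-%d", i))
		if err != nil {
			panic(err)
		}
		// In worker.go the caller now does: req.SetNumberOfCachedPromptTokens(cachedTokens)
		fmt.Printf("request %d: cached prompt tokens = %d\n", i, cachedTokens)
	}
}
```

Decoupling the helper from `CompletionRequest` is what lets the chat-completions path share it: `worker.go` first renders the messages to a plain prompt via `getPromptForKVCache`, then feeds that string to the same `OnRequestStart` used for text completions.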