
Commit 88d4425

committed
- move chat template related code from worker to simulator
- fix makefile to allow a valid python build process

Signed-off-by: Maya Barnea <mayab@il.ibm.com>

1 parent 218fa5f · commit 88d4425

File tree: 5 files changed, +186 −81 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ pkg/dataset/.llm-d
 pkg/llm-d-inference-sim/tests-tmp/
 pkg/llm-d-inference-sim/.llm-d/
 .llm-d/
+.venv

Makefile

Lines changed: 140 additions & 45 deletions
@@ -31,6 +31,8 @@ IMAGE_REGISTRY ?= ghcr.io/llm-d
 IMAGE_TAG_BASE ?= $(IMAGE_REGISTRY)/$(PROJECT_NAME)
 SIM_TAG ?= dev
 IMG = $(IMAGE_TAG_BASE):$(SIM_TAG)
+POD_IP ?= pod
+export POD_IP

 ifeq ($(TARGETOS),darwin)
 ifeq ($(TARGETARCH),amd64)
@@ -60,20 +62,61 @@ export PKG_CONFIG_PATH=/usr/lib/pkgconfig
 PYTHON_VERSION := 3.12

 # Unified Python configuration detection. This block runs once.
-PYTHON_CONFIG ?= $(shell command -v python$(PYTHON_VERSION)-config || command -v python3-config)
+# It prioritizes python-config, then pkg-config, for reliability.
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Darwin)
+  # macOS: Find Homebrew's python-config script for the most reliable flags.
+  BREW_PREFIX := $(shell command -v brew >/dev/null 2>&1 && brew --prefix python@$(PYTHON_VERSION) 2>/dev/null)
+  PYTHON_CONFIG := $(BREW_PREFIX)/bin/python$(PYTHON_VERSION)-config
+  ifneq ($(shell $(PYTHON_CONFIG) --cflags 2>/dev/null),)
+    PYTHON_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags)
+    # Use --ldflags --embed to get all necessary flags for linking
+    PYTHON_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+    PYTHON_LIBS :=
+  else
+    $(error "Could not execute 'python$(PYTHON_VERSION)-config' from Homebrew. Please ensure Python is installed correctly with: 'brew install python@$(PYTHON_VERSION)'")
+  endif
+else ifeq ($(UNAME_S),Linux)
+  # Linux: Use standard system tools to find flags.
+  PYTHON_CONFIG := $(shell command -v python$(PYTHON_VERSION)-config || command -v python3-config)
+  ifneq ($(shell $(PYTHON_CONFIG) --cflags 2>/dev/null),)
+    # Use python-config if available and correct
+    PYTHON_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags)
+    PYTHON_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+    PYTHON_LIBS :=
+  else ifneq ($(shell pkg-config --cflags python-$(PYTHON_VERSION) 2>/dev/null),)
+    # Fallback to pkg-config
+    PYTHON_CFLAGS := $(shell pkg-config --cflags python-$(PYTHON_VERSION))
+    PYTHON_LDFLAGS := $(shell pkg-config --libs python-$(PYTHON_VERSION))
+    PYTHON_LIBS :=
+  else
+    $(error "Python $(PYTHON_VERSION) development headers not found. Please install with: 'sudo apt install python$(PYTHON_VERSION)-dev' or 'sudo dnf install python$(PYTHON_VERSION)-devel'")
+  endif
+else
+  $(error "Unsupported OS: $(UNAME_S)")
+endif
+
+# Final CGO flags with all dependencies
+CGO_CFLAGS_FINAL := $(PYTHON_CFLAGS) -Ilib
+CGO_LDFLAGS_FINAL := $(PYTHON_LDFLAGS) $(PYTHON_LIBS) -Llib -ltokenizers -ldl -lm
+
+VENV_DIR ?= $(shell pwd)/.venv
+VENV_BIN := $(VENV_DIR)/bin
+VENV_SRC := $(VENV_DIR)/python

-CGO_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags --embed)
-CGO_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+PYTHON_EXE := $(shell command -v python$(PYTHON_VERSION) || command -v python3)

 GOMODCACHE := $(shell go env GOMODCACHE)
 KV_CACHE_MGR_VERSION := $(shell go list -m -f '{{.Version}}' github.com/llm-d/llm-d-kv-cache-manager)
 KV_CACHE_MGR_PATH := $(GOMODCACHE)/github.com/llm-d/llm-d-kv-cache-manager@$(KV_CACHE_MGR_VERSION)/pkg/preprocessing/chat_completions
-export PYTHONPATH := $(KV_CACHE_MGR_PATH):$(PYTHONPATH)

-CPATH := $(PYTHON_INCLUDE):$(CPATH)
+# Common environment variables for Go tests and builds
+export CGO_ENABLED=1
+export CGO_CFLAGS=$(CGO_CFLAGS_FINAL)
+export CGO_LDFLAGS=$(CGO_LDFLAGS_FINAL)
+export PYTHONPATH=$(VENV_SRC):$(VENV_DIR)/lib/python$(PYTHON_VERSION)/site-packages

 GO_LDFLAGS := -extldflags '-L$(shell pwd)/lib $(LDFLAGS) $(CGO_LDFLAGS)'
-CGO_ENABLED=1
 TOKENIZER_LIB = lib/libtokenizers.a
 # Extract TOKENIZER_VERSION from Dockerfile
 TOKENIZER_VERSION := $(shell grep '^ARG TOKENIZER_VERSION=' Dockerfile | cut -d'=' -f2)
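Note: with this block the Makefile now exports CGO_ENABLED, CGO_CFLAGS, CGO_LDFLAGS and a venv-based PYTHONPATH for every target, instead of deriving the Python flags from a single python-config invocation. If the detection misbehaves on a particular machine, the detect-python target added further down in this diff prints the resolved Python executable, CFLAGS and LDFLAGS (run "make detect-python"); the exact values naturally depend on the local Python installation.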
@@ -84,7 +127,11 @@ $(TOKENIZER_LIB):
 	## Download the HuggingFace tokenizer bindings.
 	@echo "Downloading HuggingFace tokenizer bindings for version $(TOKENIZER_VERSION)..."
 	mkdir -p lib
-	curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-$(TOKENIZER_ARCH).tar.gz | tar -xz -C lib
+	if [ "$(TARGETOS)" = "darwin" ] && [ "$(TARGETARCH)" = "amd64" ]; then \
+		curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-x86_64.tar.gz | tar -xz -C lib; \
+	else \
+		curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib; \
+	fi
 	ranlib lib/*.a

 ##@ Development
@@ -101,13 +148,18 @@ format: ## Format Go source files
 	@gofmt -l -w $(SRC)

 .PHONY: test
-test: $(GINKGO) install-dependencies ## Run tests
-	@printf "\033[33;1m==== Running tests ====\033[0m\n"
-ifdef GINKGO_FOCUS
-	CGO_ENABLED=1 CGO_CFLAGS="$(CGO_CFLAGS)" $(GINKGO) -ldflags="$(GO_LDFLAGS)" -v -r -- -ginkgo.v -ginkgo.focus="$(GINKGO_FOCUS)"
-else
-	CGO_ENABLED=1 CGO_CFLAGS="$(CGO_CFLAGS)" $(GINKGO) -ldflags="$(GO_LDFLAGS)" -v -r $(TEST_PKG)
-endif
+test: download-tokenizer install-python-deps # download-zmq ## Run unit tests
+	@printf "\033[33;1m==== Running unit tests ====\033[0m\n"
+	if [ -n "$(GINKGO_FOCUS)" ] && [ -z "$(GINKGO_FOCUS_PKG)" ]; then \
+		echo "Error: GINKGO_FOCUS is defined without GINKGO_FOCUS_PKG. Both required or neither."; \
+		exit 1; \
+	elif [ -n "$(GINKGO_FOCUS)$(GINKGO_FOCUS_PKG)" ]; then \
+		echo "Running specific tests"; \
+		go test -v $(GINKGO_FOCUS_PKG) $(if $(GINKGO_FOCUS),-ginkgo.focus="$(GINKGO_FOCUS)",); \
+	else \
+		echo "Running all tests"; \
+		go test -v ./pkg/...; \
+	fi

 .PHONY: post-deploy-test
 post-deploy-test: ## Run post deployment tests
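Usage note for the reworked test target: a plain "make test" runs go test -v ./pkg/..., while a focused run needs both variables, for example make test GINKGO_FOCUS="some spec" GINKGO_FOCUS_PKG=./pkg/llm-d-inference-sim (the focus string and package path here are illustrative only). Setting GINKGO_FOCUS without GINKGO_FOCUS_PKG now fails with an explicit error.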
@@ -122,10 +174,15 @@ lint: $(GOLANGCI_LINT) ## Run lint
 ##@ Build

 .PHONY: build
-build: check-go install-dependencies
+build: check-go download-tokenizer install-python-deps download-zmq
 	@printf "\033[33;1m==== Building ====\033[0m\n"
 	CGO_CFLAGS="$(CGO_CFLAGS)" go build -ldflags="$(GO_LDFLAGS)" -o $(LOCALBIN)/$(PROJECT_NAME) cmd/$(PROJECT_NAME)/main.go

+.PHONY: run
+run: install-python-deps # build ## Run the application locally
+	@printf "\033[33;1m==== Running application ====\033[0m\n"
+	. $(VENV_DIR)/bin/activate && ./bin/$(PROJECT_NAME) $(ARGS)
+
 ##@ Container Build/Push

 .PHONY: image-build
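The new run target activates the virtual environment and then launches ./bin/$(PROJECT_NAME) with whatever is passed in ARGS, e.g. make run ARGS="<simulator flags>". Since the build prerequisite is commented out in this version, the binary has to be built beforehand with make build.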
@@ -234,43 +291,81 @@ print-project-name: ## Print the current project name
 install-hooks: ## Install git hooks
 	git config core.hooksPath hooks

+.PHONY: detect-python
+detect-python: ## Detects Python and prints the configuration.
+	@printf "\033[33;1m==== Python Configuration ====\033[0m\n"
+	@if [ -z "$(PYTHON_EXE)" ]; then \
+		echo "ERROR: Python 3 not found in PATH."; \
+		exit 1; \
+	fi
+	@# Verify the version of the found python executable using its exit code
+	@if ! $(PYTHON_EXE) -c "import sys; sys.exit(0 if sys.version_info[:2] == ($(shell echo $(PYTHON_VERSION) | cut -d. -f1), $(shell echo $(PYTHON_VERSION) | cut -d. -f2)) else 1)"; then \
+		echo "ERROR: Found Python at '$(PYTHON_EXE)' but it is not version $(PYTHON_VERSION)."; \
+		echo "Please ensure 'python$(PYTHON_VERSION)' or a compatible 'python3' is in your PATH."; \
+		exit 1; \
+	fi
+	@echo "Python executable: $(PYTHON_EXE) ($$($(PYTHON_EXE) --version))"
+	@echo "Python CFLAGS: $(PYTHON_CFLAGS)"
+	@echo "Python LDFLAGS: $(PYTHON_LDFLAGS)"
+	@if [ -z "$(PYTHON_CFLAGS)" ]; then \
+		echo "ERROR: Python development headers not found. See installation instructions above."; \
+		exit 1; \
+	fi
+	@printf "\033[33;1m==============================\033[0m\n"
+
+.PHONY: install-python-deps
+install-python-deps: detect-python ## Sets up the Python virtual environment and installs dependencies.
+	@printf "\033[33;1m==== Setting up Python virtual environment in $(VENV_DIR) ====\033[0m\n"
+	@if [ ! -f "$(VENV_BIN)/pip" ]; then \
+		echo "Creating virtual environment..."; \
+		$(PYTHON_EXE) -m venv $(VENV_DIR) || { \
+			echo "ERROR: Failed to create virtual environment."; \
+			echo "Your Python installation may be missing the 'venv' module."; \
+			echo "Try: 'sudo apt install python$(PYTHON_VERSION)-venv' or 'sudo dnf install python$(PYTHON_VERSION)-devel'"; \
+			exit 1; \
+		}; \
+		mkdir -p $(VENV_SRC); \
+	fi
+	@echo "Upgrading pip and installing dependencies..."
+	@$(VENV_BIN)/pip install --upgrade pip
+	cp $(KV_CACHE_MGR_PATH)/requirements.txt $(VENV_SRC)/
+	cp $(KV_CACHE_MGR_PATH)/render_jinja_template_wrapper.py $(VENV_SRC)/
+	chmod u+w $(VENV_SRC)/*
+	@$(VENV_BIN)/pip install -r $(VENV_SRC)/requirements.txt
+	@echo "Verifying transformers installation..."
+	@$(VENV_BIN)/python -c "import transformers; print('✅ Transformers version ' + transformers.__version__ + ' installed.')" || { \
+		echo "ERROR: transformers library not properly installed in venv."; \
+		exit 1; \
+	}
+
 ##@ ZMQ Setup

-.PHONY: install-dependencies
-install-dependencies: download-tokenizer ## Install development dependencies based on OS/ARCH
-	@echo "Checking and installing development dependencies..."
-	@if [ "$(TARGETOS)" = "linux" ]; then \
-		if [ -x "$$(command -v apt)" ]; then \
-			if ! dpkg -s libzmq3-dev >/dev/null 2>&1 || ! dpkg -s g++ >/dev/null 2>&1; then \
-				echo "Installing dependencies with apt..."; \
-				sudo apt-get update && sudo apt-get install -y libzmq3-dev g++; \
-			else \
-				echo "✅ ZMQ and g++ are already installed."; \
-			fi; \
-		elif [ -x "$$(command -v dnf)" ]; then \
-			if ! dnf -q list installed zeromq-devel >/dev/null 2>&1 || ! dnf -q list installed gcc-c++ >/dev/null 2>&1; then \
-				echo "Installing dependencies with dnf..."; \
-				sudo dnf install -y zeromq-devel gcc-c++; \
+.PHONY: download-zmq
+download-zmq: ## Install ZMQ dependencies based on OS/ARCH
+	@echo "Checking if ZMQ is already installed..."
+	@if pkg-config --exists libzmq; then \
+		echo "✅ ZMQ is already installed."; \
+	else \
+		echo "Installing ZMQ dependencies..."; \
+		if [ "$(TARGETOS)" = "linux" ]; then \
+			if [ -x "$$(command -v apt)" ]; then \
+				apt update && apt install -y libzmq3-dev; \
+			elif [ -x "$$(command -v dnf)" ]; then \
+				dnf install -y zeromq-devel; \
 			else \
-				echo "✅ ZMQ and gcc-c++ are already installed."; \
+				echo "Unsupported Linux package manager. Install libzmq manually."; \
+				exit 1; \
 			fi; \
-		else \
-			echo "Unsupported Linux package manager. Install libzmq and g++/gcc-c++ manually."; \
-			exit 1; \
-		fi; \
-	elif [ "$(TARGETOS)" = "darwin" ]; then \
-		if [ -x "$$(command -v brew)" ]; then \
-			if ! brew list zeromq pkg-config >/dev/null 2>&1; then \
-				echo "Installing dependencies with brew..."; \
-				brew install zeromq pkg-config; \
+		elif [ "$(TARGETOS)" = "darwin" ]; then \
+			if [ -x "$$(command -v brew)" ]; then \
+				brew install zeromq; \
 			else \
-				echo "✅ ZeroMQ and pkgconf are already installed."; \
+				echo "Homebrew is not installed and is required to install zeromq. Install it from https://brew.sh/"; \
+				exit 1; \
 			fi; \
 		else \
-			echo "Homebrew is not installed and is required to install zeromq. Install it from https://brew.sh/"; \
+			echo "Unsupported OS: $(TARGETOS). Install libzmq manually - check https://zeromq.org/download/ for guidance."; \
 			exit 1; \
 		fi; \
-	else \
-		echo "Unsupported OS: $(TARGETOS). Install development dependencies manually."; \
-		exit 1; \
+		echo "✅ ZMQ dependencies installed."; \
 	fi
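Design note: the old install-dependencies target is effectively split up: the tokenizer download and Python setup become explicit prerequisites of build and test, while download-zmq only installs libzmq (and skips the install entirely when pkg-config --exists libzmq already succeeds). It no longer installs g++/gcc-c++ and drops the sudo prefixes, which suggests it is aimed at root or container environments; note it is also commented out in the test prerequisites above.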

pkg/kv-cache/kv_cache.go

Lines changed: 3 additions & 1 deletion
@@ -35,7 +35,7 @@ type KVCacheHelper struct {
 	blockSize int
 }

-func NewKVCacheHelper(ctx context.Context, config *common.Configuration, logger logr.Logger, usageChan chan float64,
+func NewKVCacheHelper(config *common.Configuration, logger logr.Logger, usageChan chan float64,
 	tokenizer tokenization.Tokenizer) (*KVCacheHelper, error) {
 	tokenProcConfig := kvblock.DefaultTokenProcessorConfig()
 	tokenProcConfig.BlockSize = config.TokenBlockSize
@@ -71,6 +71,8 @@ func (h *KVCacheHelper) Activate() {
 	h.blockCache.activate()
 }

+// OnRequestStart called when request received, simulates KV-cache block management
+// Returns number of tokens found in the cache.
 func (h *KVCacheHelper) OnRequestStart(prompt, modelName, requestID string) (int, error) {
 	h.logger.V(logging.TRACE).Info("KV cache - process request")
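For code outside this diff, the updated KVCacheHelper API looks roughly like the sketch below; cfg, logger, usageChan and tok are placeholders for values the caller already holds (they are not defined here), and the call mirrors the new signature with the context.Context parameter removed.

	// Construct the helper without a context, per the new signature.
	helper, err := kvcache.NewKVCacheHelper(cfg, logger, usageChan, tok)
	if err != nil {
		return err
	}
	helper.Activate()
	// OnRequestStart reports how many of the prompt's tokens were already cached.
	numOfCachedTokens, err := helper.OnRequestStart(prompt, modelName, requestID)
	if err != nil {
		return err
	}
	// ... use numOfCachedTokens to advance the simulated cache state.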

pkg/llm-d-inference-sim/simulator.go

Lines changed: 35 additions & 1 deletion
@@ -380,7 +380,7 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error {
 	}
 	s.logger.V(logging.DEBUG).Info("Chat template loaded", "template", s.chatTemplate, "params", s.chatTemplateKWArgs)

-	s.kvcacheHelper, err = kvcache.NewKVCacheHelper(ctx, s.config, s.logger, s.metrics.kvCacheUsageChan, s.tokenizer)
+	s.kvcacheHelper, err = kvcache.NewKVCacheHelper(s.config, s.logger, s.metrics.kvCacheUsageChan, s.tokenizer)
 	if err != nil {
 		return err
 	}
@@ -790,3 +790,37 @@ func (s *VllmSimulator) dequeue() *openaiserverapi.CompletionReqCtx {

 	return nil
 }
+
+func (s *VllmSimulator) getPromptForKVCache(reqCtx *openaiserverapi.CompletionReqCtx) (string, error) {
+	if reqCtx.IsChatCompletion {
+		renderReq := preprocessing.RenderJinjaTemplateRequest{
+			Conversations:             make([]preprocessing.ChatMessage, 0),
+			Tools:                     make([]interface{}, 0),
+			Documents:                 make([]interface{}, 0),
+			ReturnAssistantTokensMask: false,
+			ContinueFinalMessage:      false,
+			AddGenerationPrompt:       false,
+			ChatTemplate:              s.chatTemplate,
+			ChatTemplateKWArgs:        s.chatTemplateKWArgs,
+		}
+		// Convert messages to the format expected by the renderer
+		for _, msg := range reqCtx.CompletionReq.GetMessages() {
+			renderReq.Conversations = append(renderReq.Conversations, preprocessing.ChatMessage{
+				Role:    msg.Role,
+				Content: msg.Content.Raw,
+			})
+		}
+
+		// Don't use vllmReq.GetModel() - it may include LoRA's name.
+		// This call requires the base model name instead.
+		prompt, err := s.tokenizer.RenderChatTemplate(s.config.Model, &renderReq)
+		if err != nil {
+			s.logger.Error(err, "failed to render template")
+			return "", err
+		}
+		s.logger.Info("Convert prompt", "rendered", prompt)
+		return prompt, nil
+	}
+
+	return reqCtx.CompletionReq.GetPrompt(), nil
+}
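Moving this logic into simulator.go keeps the chat-template rendering (and the preprocessing import it needs) next to the code that loads the template, matching the commit message; worker.go below shrinks to a single getPromptForKVCache call plus error handling. For non-chat completions the helper simply returns the raw prompt.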

pkg/llm-d-inference-sim/worker.go

Lines changed: 7 additions & 34 deletions
@@ -26,7 +26,6 @@ import (
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
 	"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
 	openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
-	preprocessing "github.com/llm-d/llm-d-kv-cache-manager/pkg/preprocessing/chat_completions"
 	"github.com/valyala/fasthttp"
 )
@@ -91,40 +90,14 @@ func (s *VllmSimulator) processRequestAsync(reqCtx *openaiserverapi.CompletionRe
 	}

 	if s.config.EnableKVCache {
-		var prompt string
+		prompt, err := s.getPromptForKVCache(reqCtx)

-		if reqCtx.IsChatCompletion {
-			renderReq := preprocessing.RenderJinjaTemplateRequest{
-				Conversations:             make([]preprocessing.ChatMessage, 0),
-				Tools:                     make([]interface{}, 0),
-				Documents:                 make([]interface{}, 0),
-				ReturnAssistantTokensMask: false,
-				ContinueFinalMessage:      false,
-				AddGenerationPrompt:       false,
-				ChatTemplate:              s.chatTemplate,
-				ChatTemplateKWArgs:        s.chatTemplateKWArgs,
-			}
-			// Convert messages to the format expected by the renderer
-			for _, msg := range req.GetMessages() {
-				renderReq.Conversations = append(renderReq.Conversations, preprocessing.ChatMessage{
-					Role:    msg.Role,
-					Content: msg.Content.Raw,
-				})
-			}
-
-			var err error
-			// Don't use vllmReq.GetModel() - it may include LoRA's name.
-			// This call requires the base model name instead.
-			prompt, err = s.tokenizer.RenderChatTemplate(s.config.Model, &renderReq)
-			if err != nil {
-				s.logger.Error(err, "failed to render template")
-				s.sendCompletionError(reqCtx.HTTPReqCtx,
-					openaiserverapi.NewCompletionError(err.Error(), fasthttp.StatusInternalServerError, nil),
-					false)
-				return
-			}
-		} else {
-			prompt = req.GetPrompt()
+		if err != nil {
+			s.logger.Error(err, "failed to render template")
+			s.sendCompletionError(reqCtx.HTTPReqCtx,
+				openaiserverapi.NewCompletionError(err.Error(), fasthttp.StatusInternalServerError, nil),
+				false)
+			return
 		}

 		if numOfCachedTokens, err := s.kvcacheHelper.OnRequestStart(prompt, req.GetModel(), req.GetRequestID()); err != nil {
