diff --git a/.github/workflows/ci-pr-checks.yaml b/.github/workflows/ci-pr-checks.yaml
index 12f9d82..ef07f3f 100644
--- a/.github/workflows/ci-pr-checks.yaml
+++ b/.github/workflows/ci-pr-checks.yaml
@@ -25,7 +25,9 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install -y pkg-config python3-dev python3-pip
-        make install-dependencies
+        make download-tokenizer
+        make download-zmq
+        make install-python-deps
         pip3 install transformers --break-system-packages
     - name: Run lint checks
diff --git a/.gitignore b/.gitignore
index d684889..8f8339b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ pkg/dataset/.llm-d
 pkg/llm-d-inference-sim/tests-tmp/
 pkg/llm-d-inference-sim/.llm-d/
 .llm-d/
+.venv
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 6b3625d..aec21ec 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,8 @@ IMAGE_REGISTRY ?= ghcr.io/llm-d
 IMAGE_TAG_BASE ?= $(IMAGE_REGISTRY)/$(PROJECT_NAME)
 SIM_TAG ?= dev
 IMG = $(IMAGE_TAG_BASE):$(SIM_TAG)
+POD_IP ?= pod
+export POD_IP
 
 ifeq ($(TARGETOS),darwin)
 	ifeq ($(TARGETARCH),amd64)
@@ -60,20 +62,61 @@ export PKG_CONFIG_PATH=/usr/lib/pkgconfig
 PYTHON_VERSION := 3.12
 
 # Unified Python configuration detection. This block runs once.
-PYTHON_CONFIG ?= $(shell command -v python$(PYTHON_VERSION)-config || command -v python3-config)
+# It prioritizes python-config, then pkg-config, for reliability.
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Darwin)
+  # macOS: Find Homebrew's python-config script for the most reliable flags.
+  BREW_PREFIX := $(shell command -v brew >/dev/null 2>&1 && brew --prefix python@$(PYTHON_VERSION) 2>/dev/null)
+  PYTHON_CONFIG := $(BREW_PREFIX)/bin/python$(PYTHON_VERSION)-config
+  ifneq ($(shell $(PYTHON_CONFIG) --cflags 2>/dev/null),)
+    PYTHON_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags)
+    # Use --ldflags --embed to get all necessary flags for linking
+    PYTHON_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+    PYTHON_LIBS :=
+  else
+    $(error "Could not execute 'python$(PYTHON_VERSION)-config' from Homebrew. Please ensure Python is installed correctly with: 'brew install python@$(PYTHON_VERSION)'")
+  endif
+else ifeq ($(UNAME_S),Linux)
+  # Linux: Use standard system tools to find flags.
+  PYTHON_CONFIG := $(shell command -v python$(PYTHON_VERSION)-config || command -v python3-config)
+  ifneq ($(shell $(PYTHON_CONFIG) --cflags 2>/dev/null),)
+    # Use python-config if available and correct
+    PYTHON_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags)
+    PYTHON_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+    PYTHON_LIBS :=
+  else ifneq ($(shell pkg-config --cflags python-$(PYTHON_VERSION) 2>/dev/null),)
+    # Fall back to pkg-config
+    PYTHON_CFLAGS := $(shell pkg-config --cflags python-$(PYTHON_VERSION))
+    PYTHON_LDFLAGS := $(shell pkg-config --libs python-$(PYTHON_VERSION))
+    PYTHON_LIBS :=
+  else
+    $(error "Python $(PYTHON_VERSION) development headers not found. Please install with: 'sudo apt install python$(PYTHON_VERSION)-dev' or 'sudo dnf install python$(PYTHON_VERSION)-devel'")
+  endif
+else
+  $(error "Unsupported OS: $(UNAME_S)")
+endif
+
+# Final CGO flags with all dependencies
+CGO_CFLAGS_FINAL := $(PYTHON_CFLAGS) -Ilib
+CGO_LDFLAGS_FINAL := $(PYTHON_LDFLAGS) $(PYTHON_LIBS) -Llib -ltokenizers -ldl -lm
+
+VENV_DIR ?= $(shell pwd)/.venv
+VENV_BIN := $(VENV_DIR)/bin
+VENV_SRC := $(VENV_DIR)/python
 
-CGO_CFLAGS := $(shell $(PYTHON_CONFIG) --cflags --embed)
-CGO_LDFLAGS := $(shell $(PYTHON_CONFIG) --ldflags --embed)
+PYTHON_EXE := $(shell command -v python$(PYTHON_VERSION) || command -v python3)
 
 GOMODCACHE := $(shell go env GOMODCACHE)
 KV_CACHE_MGR_VERSION := $(shell go list -m -f '{{.Version}}' github.com/llm-d/llm-d-kv-cache-manager)
 KV_CACHE_MGR_PATH := $(GOMODCACHE)/github.com/llm-d/llm-d-kv-cache-manager@$(KV_CACHE_MGR_VERSION)/pkg/preprocessing/chat_completions
-export PYTHONPATH := $(KV_CACHE_MGR_PATH):$(PYTHONPATH)
-CPATH := $(PYTHON_INCLUDE):$(CPATH)
+# Common environment variables for Go tests and builds
+export CGO_ENABLED=1
+export CGO_CFLAGS=$(CGO_CFLAGS_FINAL)
+export CGO_LDFLAGS=$(CGO_LDFLAGS_FINAL)
+export PYTHONPATH=$(VENV_SRC):$(VENV_DIR)/lib/python$(PYTHON_VERSION)/site-packages
 
 GO_LDFLAGS := -extldflags '-L$(shell pwd)/lib $(LDFLAGS) $(CGO_LDFLAGS)'
-CGO_ENABLED=1
 TOKENIZER_LIB = lib/libtokenizers.a
 # Extract TOKENIZER_VERSION from Dockerfile
 TOKENIZER_VERSION := $(shell grep '^ARG TOKENIZER_VERSION=' Dockerfile | cut -d'=' -f2)
@@ -84,7 +127,11 @@ $(TOKENIZER_LIB): ## Download the HuggingFace tokenizer bindings.
 	@echo "Downloading HuggingFace tokenizer bindings for version $(TOKENIZER_VERSION)..."
 	mkdir -p lib
-	curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-$(TOKENIZER_ARCH).tar.gz | tar -xz -C lib
+	if [ "$(TARGETOS)" = "darwin" ] && [ "$(TARGETARCH)" = "amd64" ]; then \
+		curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-x86_64.tar.gz | tar -xz -C lib; \
+	else \
+		curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib; \
+	fi
 	ranlib lib/*.a
 
 ##@ Development
@@ -101,13 +148,18 @@ format: ## Format Go source files
 	@gofmt -l -w $(SRC)
 
 .PHONY: test
-test: $(GINKGO) install-dependencies ## Run tests
-	@printf "\033[33;1m==== Running tests ====\033[0m\n"
-ifdef GINKGO_FOCUS
-	CGO_ENABLED=1 CGO_CFLAGS="$(CGO_CFLAGS)" $(GINKGO) -ldflags="$(GO_LDFLAGS)" -v -r -- -ginkgo.v -ginkgo.focus="$(GINKGO_FOCUS)"
-else
-	CGO_ENABLED=1 CGO_CFLAGS="$(CGO_CFLAGS)" $(GINKGO) -ldflags="$(GO_LDFLAGS)" -v -r $(TEST_PKG)
-endif
+test: download-tokenizer install-python-deps ## Run unit tests
+	@printf "\033[33;1m==== Running unit tests ====\033[0m\n"
+	if [ -n "$(GINKGO_FOCUS)" ] && [ -z "$(GINKGO_FOCUS_PKG)" ]; then \
+		echo "Error: GINKGO_FOCUS is defined without GINKGO_FOCUS_PKG. Both are required, or neither."; \
+		exit 1; \
+	elif [ -n "$(GINKGO_FOCUS)$(GINKGO_FOCUS_PKG)" ]; then \
+		echo "Running specific tests"; \
+		go test -v $(GINKGO_FOCUS_PKG) $(if $(GINKGO_FOCUS),-ginkgo.focus="$(GINKGO_FOCUS)",); \
+	else \
+		echo "Running all tests"; \
+		go test -v ./pkg/...; \
+	fi
 
 .PHONY: post-deploy-test
 post-deploy-test: ## Run post deployment tests
@@ -122,10 +174,15 @@ lint: $(GOLANGCI_LINT) ## Run lint
 
 ##@ Build
 
 .PHONY: build
-build: check-go install-dependencies
+build: check-go download-tokenizer install-python-deps download-zmq
 	@printf "\033[33;1m==== Building ====\033[0m\n"
 	CGO_CFLAGS="$(CGO_CFLAGS)" go build -ldflags="$(GO_LDFLAGS)" -o $(LOCALBIN)/$(PROJECT_NAME) cmd/$(PROJECT_NAME)/main.go
 
+.PHONY: run
+run: install-python-deps # build ## Run the application locally
+	@printf "\033[33;1m==== Running application ====\033[0m\n"
+	. $(VENV_DIR)/bin/activate && ./bin/$(PROJECT_NAME) $(ARGS)
+
 ##@ Container Build/Push
 
 .PHONY: image-build
@@ -234,43 +291,81 @@ print-project-name: ## Print the current project name
 
 install-hooks: ## Install git hooks
 	git config core.hooksPath hooks
 
+.PHONY: detect-python
+detect-python: ## Detects Python and prints the configuration.
+	@printf "\033[33;1m==== Python Configuration ====\033[0m\n"
+	@if [ -z "$(PYTHON_EXE)" ]; then \
+		echo "ERROR: Python 3 not found in PATH."; \
+		exit 1; \
+	fi
+	@# Verify the version of the found python executable using its exit code
+	@if ! $(PYTHON_EXE) -c "import sys; sys.exit(0 if sys.version_info[:2] == ($(shell echo $(PYTHON_VERSION) | cut -d. -f1), $(shell echo $(PYTHON_VERSION) | cut -d. -f2)) else 1)"; then \
+		echo "ERROR: Found Python at '$(PYTHON_EXE)' but it is not version $(PYTHON_VERSION)."; \
+		echo "Please ensure 'python$(PYTHON_VERSION)' or a compatible 'python3' is in your PATH."; \
+		exit 1; \
+	fi
+	@echo "Python executable: $(PYTHON_EXE) ($$($(PYTHON_EXE) --version))"
+	@echo "Python CFLAGS: $(PYTHON_CFLAGS)"
+	@echo "Python LDFLAGS: $(PYTHON_LDFLAGS)"
+	@if [ -z "$(PYTHON_CFLAGS)" ]; then \
+		echo "ERROR: Python development headers not found. See installation instructions above."; \
+		exit 1; \
+	fi
+	@printf "\033[33;1m==============================\033[0m\n"
+
+.PHONY: install-python-deps
+install-python-deps: detect-python ## Sets up the Python virtual environment and installs dependencies.
+	@printf "\033[33;1m==== Setting up Python virtual environment in $(VENV_DIR) ====\033[0m\n"
+	@if [ ! -f "$(VENV_BIN)/pip" ]; then \
+		echo "Creating virtual environment..."; \
+		$(PYTHON_EXE) -m venv $(VENV_DIR) || { \
+			echo "ERROR: Failed to create virtual environment."; \
+			echo "Your Python installation may be missing the 'venv' module."; \
+			echo "Try: 'sudo apt install python$(PYTHON_VERSION)-venv' or 'sudo dnf install python$(PYTHON_VERSION)-devel'"; \
+			exit 1; \
+		}; \
+		mkdir -p $(VENV_SRC); \
+	fi
+	@echo "Upgrading pip and installing dependencies..."
+	@$(VENV_BIN)/pip install --upgrade pip
+	cp $(KV_CACHE_MGR_PATH)/requirements.txt $(VENV_SRC)/
+	cp $(KV_CACHE_MGR_PATH)/render_jinja_template_wrapper.py $(VENV_SRC)/
+	chmod u+w $(VENV_SRC)/*
+	@$(VENV_BIN)/pip install -r $(VENV_SRC)/requirements.txt
+	@echo "Verifying transformers installation..."
+	@$(VENV_BIN)/python -c "import transformers; print('✅ Transformers version ' + transformers.__version__ + ' installed.')" || { \
+		echo "ERROR: transformers library not properly installed in venv."; \
+		exit 1; \
+	}
+
 ##@ ZMQ Setup
 
-.PHONY: install-dependencies
-install-dependencies: download-tokenizer ## Install development dependencies based on OS/ARCH
-	@echo "Checking and installing development dependencies..."
-	@if [ "$(TARGETOS)" = "linux" ]; then \
-		if [ -x "$$(command -v apt)" ]; then \
-			if ! dpkg -s libzmq3-dev >/dev/null 2>&1 || ! dpkg -s g++ >/dev/null 2>&1; then \
-				echo "Installing dependencies with apt..."; \
-				sudo apt-get update && sudo apt-get install -y libzmq3-dev g++; \
-			else \
-				echo "✅ ZMQ and g++ are already installed."; \
-			fi; \
-		elif [ -x "$$(command -v dnf)" ]; then \
-			if ! dnf -q list installed zeromq-devel >/dev/null 2>&1 || ! dnf -q list installed gcc-c++ >/dev/null 2>&1; then \
-				echo "Installing dependencies with dnf..."; \
-				sudo dnf install -y zeromq-devel gcc-c++; \
+.PHONY: download-zmq
+download-zmq: ## Install ZMQ dependencies based on OS/ARCH
+	@echo "Checking if ZMQ is already installed..."
+	@if pkg-config --exists libzmq; then \
+		echo "✅ ZMQ is already installed."; \
+	else \
+		echo "Installing ZMQ dependencies..."; \
+		if [ "$(TARGETOS)" = "linux" ]; then \
+			if [ -x "$$(command -v apt)" ]; then \
+				apt update && apt install -y libzmq3-dev; \
+			elif [ -x "$$(command -v dnf)" ]; then \
+				dnf install -y zeromq-devel; \
 			else \
-				echo "✅ ZMQ and gcc-c++ are already installed."; \
+				echo "Unsupported Linux package manager. Install libzmq manually."; \
+				exit 1; \
 			fi; \
-		else \
-			echo "Unsupported Linux package manager. Install libzmq and g++/gcc-c++ manually."; \
-			exit 1; \
-		fi; \
-	elif [ "$(TARGETOS)" = "darwin" ]; then \
-		if [ -x "$$(command -v brew)" ]; then \
-			if ! brew list zeromq pkg-config >/dev/null 2>&1; then \
-				echo "Installing dependencies with brew..."; \
-				brew install zeromq pkg-config; \
+		elif [ "$(TARGETOS)" = "darwin" ]; then \
+			if [ -x "$$(command -v brew)" ]; then \
+				brew install zeromq; \
 			else \
-				echo "✅ ZeroMQ and pkgconf are already installed."; \
+				echo "Homebrew is not installed and is required to install zeromq. Install it from https://brew.sh/"; \
+				exit 1; \
 			fi; \
 		else \
-			echo "Homebrew is not installed and is required to install zeromq. Install it from https://brew.sh/"; \
+			echo "Unsupported OS: $(TARGETOS). Install libzmq manually - check https://zeromq.org/download/ for guidance."; \
 			exit 1; \
 		fi; \
-	else \
-		echo "Unsupported OS: $(TARGETOS). Install development dependencies manually."; \
-		exit 1; \
+		echo "✅ ZMQ dependencies installed."; \
 	fi
diff --git a/README.md b/README.md
index 610eefd..a837943 100644
--- a/README.md
+++ b/README.md
@@ -332,7 +332,7 @@ ### Running
 To run the vLLM simulator in a standalone test environment, run:
 ```bash
-./bin/llm-d-inference-sim --model my_model --port 8000
+make run ARGS='--model="Qwen/Qwen2.5-1.5B-Instruct" --port 8000 -v=4'
 ```
 
 ## Kubernetes testing
diff --git a/pkg/kv-cache/kv_cache.go b/pkg/kv-cache/kv_cache.go
index a87c7b7..ae6da91 100644
--- a/pkg/kv-cache/kv_cache.go
+++ b/pkg/kv-cache/kv_cache.go
@@ -23,7 +23,6 @@ import (
 	"github.com/go-logr/logr"
 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
 	"github.com/llm-d/llm-d-inference-sim/pkg/common/logging"
-	openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
 )
@@ -49,6 +48,7 @@
 	if err != nil {
 		return nil, fmt.Errorf("failed to create block cache: %w", err)
 	}
+
 	return &KVCacheHelper{
 		tokenizer:       tokenizer,
 		tokensProcessor: tokensProcessor,
@@ -71,18 +71,16 @@
 func (h *KVCacheHelper) Activate() {
 	h.blockCache.activate()
 }
 
-func (h *KVCacheHelper) OnRequestStart(vllmReq openaiserverapi.CompletionRequest) error {
+// OnRequestStart is called when a request is received. It simulates KV-cache block
+// management and returns the number of prompt tokens found in the cache.
+func (h *KVCacheHelper) OnRequestStart(prompt, modelName, requestID string) (int, error) {
 	h.logger.V(logging.TRACE).Info("KV cache - process request")
 
-	prompt := vllmReq.GetPrompt()
-	modelName := vllmReq.GetModel()
-	requestID := vllmReq.GetRequestID()
-
 	// tokenize the input
 	tokens, _, err := h.tokenizer.Encode(prompt, modelName)
 	if err != nil {
 		h.logger.Error(err, "prompt tokenization failed")
-		return err
+		return 0, err
 	}
 
 	// get block keys
@@ -95,8 +93,7 @@
 	}
 
 	nBlocksAlreadyInCache, err := h.blockCache.startRequest(requestID, blockHashes)
-	vllmReq.SetNumberOfCachedPromptTokens(nBlocksAlreadyInCache * h.blockSize)
-	return err
+	return nBlocksAlreadyInCache * h.blockSize, err
 }
 
 func (h *KVCacheHelper) OnRequestEnd(requestID string) error {
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index f097853..ab650b3 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -39,6 +39,7 @@ import (
 	kvcache "github.com/llm-d/llm-d-inference-sim/pkg/kv-cache"
 	openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
 	vllmapi "github.com/llm-d/llm-d-inference-sim/pkg/vllm-api"
+	preprocessing "github.com/llm-d/llm-d-kv-cache-manager/pkg/preprocessing/chat_completions"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
 )
@@ -53,6 +54,7 @@
 	requestIDHeader = "X-Request-Id"
 	podNameEnv      = "POD_NAME"
 	podNsEnv        = "POD_NAMESPACE"
+	envHFToken      = "HF_TOKEN"
 )
 
 type loraUsageState int
@@ -212,6 +214,11 @@
 	queueCapacity int
 	// a channel for incoming requests
 	newRequests chan *openaiserverapi.CompletionReqCtx
+
+	// chat template for converting /chat/completions request content to a plain string
+	chatTemplate string
+	// parameters for the chat template
+	chatTemplateKWArgs map[string]interface{}
 }
 
 // New creates a new VllmSimulator instance with the given logger
@@ -355,6 +362,24 @@ func (s *VllmSimulator) initializeSim(ctx context.Context) error {
 	}
 
 	if s.config.EnableKVCache {
+		// initialize the chat template; this is needed only when the KV cache is enabled
+		templateReq := preprocessing.FetchChatTemplateRequest{
+			Model: s.config.Model,
+			Token: os.Getenv(envHFToken),
+		}
+
+		chatTemplatingProcessor := preprocessing.NewChatTemplatingProcessor()
+		if err := chatTemplatingProcessor.Initialize(); err != nil {
+			return fmt.Errorf("failed to initialize chat-templating processor: %w", err)
+		}
+
+		s.chatTemplate, s.chatTemplateKWArgs, err = chatTemplatingProcessor.FetchChatTemplate(ctx, templateReq)
+		if err != nil {
+			s.logger.Error(err, "failed to get chat template")
+			return err
+		}
+		s.logger.V(logging.DEBUG).Info("Chat template loaded", "template", s.chatTemplate, "params", s.chatTemplateKWArgs)
+
 		s.kvcacheHelper, err = kvcache.NewKVCacheHelper(s.config, s.logger, s.metrics.kvCacheUsageChan, s.tokenizer)
 		if err != nil {
 			return err
@@ -765,3 +790,37 @@
 
 	return nil
 }
+
+func (s *VllmSimulator) getPromptForKVCache(reqCtx *openaiserverapi.CompletionReqCtx) (string, error) {
+	if reqCtx.IsChatCompletion {
+		renderReq := preprocessing.RenderJinjaTemplateRequest{
+			Conversations:             make([]preprocessing.ChatMessage, 0),
+			Tools:                     make([]interface{}, 0),
+			Documents:                 make([]interface{}, 0),
+			ReturnAssistantTokensMask: false,
+			ContinueFinalMessage:      false,
+			AddGenerationPrompt:       false,
+			ChatTemplate:              s.chatTemplate,
+			ChatTemplateKWArgs:        s.chatTemplateKWArgs,
+		}
+		// Convert messages to the format expected by the renderer
+		for _, msg := range reqCtx.CompletionReq.GetMessages() {
+			renderReq.Conversations = append(renderReq.Conversations, preprocessing.ChatMessage{
+				Role:    msg.Role,
+				Content: msg.Content.Raw,
+			})
+		}
+
+		// Don't use vllmReq.GetModel() - it may include a LoRA adapter name.
+		// This call requires the base model name instead.
+		prompt, err := s.tokenizer.RenderChatTemplate(s.config.Model, &renderReq)
+		if err != nil {
+			s.logger.Error(err, "failed to render template")
+			return "", err
+		}
+		s.logger.Info("Converted prompt", "rendered", prompt)
+		return prompt, nil
+	}
+
+	return reqCtx.CompletionReq.GetPrompt(), nil
+}
diff --git a/pkg/llm-d-inference-sim/worker.go b/pkg/llm-d-inference-sim/worker.go
index 481d185..5a248db 100644
--- a/pkg/llm-d-inference-sim/worker.go
+++ b/pkg/llm-d-inference-sim/worker.go
@@ -89,12 +89,23 @@
 			"metrics.lorasChan")
 	}
 
-	if s.config.EnableKVCache && !reqCtx.IsChatCompletion {
-		// kv cache is currently supported for /completion API only
-		if err := s.kvcacheHelper.OnRequestStart(req); err != nil {
+	if s.config.EnableKVCache {
+		prompt, err := s.getPromptForKVCache(reqCtx)
+		if err != nil {
+			s.logger.Error(err, "failed to render template")
+			s.sendCompletionError(reqCtx.HTTPReqCtx,
+				openaiserverapi.NewCompletionError(err.Error(), fasthttp.StatusInternalServerError, nil),
+				false)
+			return
+		}
+
+		if numOfCachedTokens, err := s.kvcacheHelper.OnRequestStart(prompt, req.GetModel(), req.GetRequestID()); err != nil {
 			s.sendCompletionError(reqCtx.HTTPReqCtx,
 				openaiserverapi.NewCompletionError(err.Error(), fasthttp.StatusInternalServerError, nil),
 				false)
+		} else {
+			req.SetNumberOfCachedPromptTokens(numOfCachedTokens)
+		}
 	}
diff --git a/pkg/openai-server-api/request.go b/pkg/openai-server-api/request.go
index 80bc4cf..ed06556 100644
--- a/pkg/openai-server-api/request.go
+++ b/pkg/openai-server-api/request.go
@@ -50,6 +50,8 @@
 	SetNumberOfCachedPromptTokens(cachedPromptTokens int)
 	// GetPrompt returns the prompt
 	GetPrompt() string
+	// GetMessages returns request messages (in chat completion)
+	GetMessages() []Message
 	// GetTools returns tools to use (in chat completion)
 	GetTools() []Tool
 	// GetToolChoice returns tool choice (in chat completion)
@@ -243,6 +245,10 @@
 	return messages
 }
 
+func (c *ChatCompletionRequest) GetMessages() []Message {
+	return c.Messages
+}
+
 func (c *ChatCompletionRequest) GetTools() []Tool {
 	return c.Tools
 }
@@ -335,6 +341,10 @@
 	return t.Prompt
 }
 
+func (c *TextCompletionRequest) GetMessages() []Message {
+	return make([]Message, 0)
+}
+
 func (c *TextCompletionRequest) GetTools() []Tool {
 	return nil
 }
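
---

Reviewer's note: to make the reworked `OnRequestStart` contract easy to see in isolation, below is a minimal, self-contained Go sketch. The toy cache, `toyHelper`, `blockSize`, and the fake tokenization are all hypothetical stand-ins for the real `pkg/kv-cache` block cache; only the shape of the call — `(prompt, modelName, requestID) -> (cachedTokens, error)`, with the caller applying `SetNumberOfCachedPromptTokens` — mirrors this PR.

```go
// Hypothetical sketch of the decoupled OnRequestStart contract.
// Not the real pkg/kv-cache implementation.
package main

import "fmt"

const blockSize = 16 // tokens per KV block (hypothetical value)

// toyHelper stands in for KVCacheHelper: it remembers how many blocks each
// model has seen, so a repeated prompt reports a warm cache.
type toyHelper struct {
	seenBlocks map[string]int
}

// OnRequestStart mimics the new API: the caller passes an already-rendered
// prompt string and receives the number of cached prompt tokens, instead of
// the helper reaching into the request object and mutating it.
func (h *toyHelper) OnRequestStart(prompt, modelName, requestID string) (int, error) {
	nBlocks := len([]rune(prompt)) / 4 // fake tokenization: ~4 chars per block
	cached := h.seenBlocks[modelName]
	if cached > nBlocks {
		cached = nBlocks
	}
	h.seenBlocks[modelName] = nBlocks // remember blocks for the next request
	return cached * blockSize, nil
}

func main() {
	h := &toyHelper{seenBlocks: map[string]int{}}
	for i, prompt := range []string{"Hello KV cache, first pass.", "Hello KV cache, first pass."} {
		cachedTokens, err := h.OnRequestStart(prompt, "Qwen/Qwen2.5-1.5B-Instruct", fmt.Sprintf("req-%d", i))
		if err != nil {
			panic(err)
		}
		// In worker.go the caller now does: req.SetNumberOfCachedPromptTokens(cachedTokens)
		fmt.Printf("request %d: cached prompt tokens = %d\n", i, cachedTokens)
	}
}
```

Decoupling the helper from `CompletionRequest` is what lets the chat-completions path share it: `worker.go` first renders the messages to a plain prompt via `getPromptForKVCache`, then feeds that string to the same `OnRequestStart` used for text completions.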