diff --git a/.gitignore b/.gitignore
index 4458cf8..e0cc71b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,7 @@ runpod.toml
 .env
 test/*
 vllm-base/vllm-*
-.DS_Store
\ No newline at end of file
+.DS_Store
+build/
+*.lock
+*.egg-info
\ No newline at end of file
diff --git a/Dockerfile.custom b/Dockerfile.custom
new file mode 100644
index 0000000..c0793a4
--- /dev/null
+++ b/Dockerfile.custom
@@ -0,0 +1,52 @@
+FROM nvidia/cuda:12.1.0-base-ubuntu22.04
+
+RUN apt-get update -y \
+    && apt-get install -y python3-pip
+
+RUN ldconfig /usr/local/cuda-12.1/compat/
+
+# Install Python dependencies
+COPY builder/requirements.txt /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade pip && \
+    python3 -m pip install --upgrade -r /requirements.txt
+
+# Install vLLM and FlashInfer via pip (switching back to pip installs: the issues that required building the fork are fixed, and image-size optimization matters less now that layer caching is used)
+RUN python3 -m pip install vllm==0.10.0 && \
+    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
+
+RUN pip install --extra-index-url https://miropsota.github.io/torch_packages_builder flash-attn==2.8.3+pt2.7.0cu126
+
+# Setup for Option 2: Building the Image with the Model included
+ARG MODEL_NAME=""
+ARG TOKENIZER_NAME=""
+ARG BASE_PATH="/runpod-volume"
+ARG QUANTIZATION=""
+ARG MODEL_REVISION=""
+ARG TOKENIZER_REVISION=""
+
+ENV MODEL_NAME=$MODEL_NAME \
+    MODEL_REVISION=$MODEL_REVISION \
+    TOKENIZER_NAME=$TOKENIZER_NAME \
+    TOKENIZER_REVISION=$TOKENIZER_REVISION \
+    BASE_PATH=$BASE_PATH \
+    QUANTIZATION=$QUANTIZATION \
+    HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
+    HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
+    HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
+    HF_HUB_ENABLE_HF_TRANSFER=0
+
+ENV PYTHONPATH="/:/vllm-workspace"
+
+
+COPY src /src
+RUN --mount=type=secret,id=HF_TOKEN,required=false \
+    if [ -f /run/secrets/HF_TOKEN ]; then \
+        export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \
+    fi && \
+    if [ -n "$MODEL_NAME" ]; then \
+        python3 /src/download_model.py; \
+    fi
+
+# Start the handler
+CMD ["python3", "/src/handler_custom.py"]
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..afaf360
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+1.0.0
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..8f876db
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,69 @@
+[build-system]
+requires = ["setuptools>=77.0.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "vllm-worker"
+dynamic = ["version"]
+description = "OpenAI-compatible vLLM worker for serverless inference. Forked from https://github.com/runpod-workers/worker-vllm"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+authors = [
+    {name = "Arief Wijaya", email = "ariefwiijaya@gmail.com"}
+]
+keywords = ["vllm", "llm", "inference", "openai", "serverless"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+]
+
+dependencies = [
+    "ray",
+    "pandas",
+    "pyarrow",
+    "runpod~=1.7.7",
+    "huggingface-hub",
+    "packaging",
+    "typing-extensions>=4.8.0",
+    "pydantic",
+    "pydantic-settings",
+    "hf-transfer",
+    "transformers>=4.51.3",
+    "bitsandbytes>=0.45.0",
+    "kernels",
+    "torch==2.6.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "pytest-asyncio",
+    "black",
+    "flake8",
+    "mypy",
+]
+
+[project.urls]
+Homepage = "https://github.com/ariefwijaya/worker-vllm"
+Repository = "https://github.com/ariefwijaya/worker-vllm"
+Documentation = "https://github.com/ariefwijaya/worker-vllm/blob/main/README.md"
+
+[tool.setuptools]
+package-dir = {"vllm_worker" = "src"}
+packages = ["vllm_worker"]
+
+[tool.setuptools.dynamic]
+version = {file = "VERSION"}
+
+[tool.black]
+line-length = 100
+target-version = ['py310', 'py311', 'py312']
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+asyncio_mode = "auto"
diff --git a/src/__init__.py b/src/__init__.py
index e69de29..dc9e1ef 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -0,0 +1,37 @@
+"""
+vLLM Worker - OpenAI-compatible vLLM inference engine
+
+Usage:
+    from vllm_worker import vLLMEngine, OpenAIvLLMEngine, JobInput
+"""
+
+try:
+    from importlib.metadata import version, PackageNotFoundError
+except ImportError:
+    # Python < 3.8
+    from importlib_metadata import version, PackageNotFoundError
+
+try:
+    __version__ = version("vllm-worker")
+except PackageNotFoundError:
+    # Package is not installed, fallback to reading VERSION file
+    from pathlib import Path
+    _version_file = Path(__file__).parent.parent / "VERSION"
+    __version__ = _version_file.read_text().strip()
+
+# Import main classes for easy access
+from .engine import vLLMEngine, OpenAIvLLMEngine
+from .utils import JobInput, DummyRequest, BatchSize, create_error_response
+from .tokenizer import TokenizerWrapper
+from .engine_args import get_engine_args
+
+__all__ = [
+    "vLLMEngine",
+    "OpenAIvLLMEngine",
+    "JobInput",
+    "DummyRequest",
+    "BatchSize",
+    "TokenizerWrapper",
+    "get_engine_args",
+    "create_error_response",
+]
diff --git a/src/download_model.py b/src/download_model.py
index 107e1e5..3cc16be 100644
--- a/src/download_model.py
+++ b/src/download_model.py
@@ -4,7 +4,13 @@
 import glob
 from shutil import rmtree
 from huggingface_hub import snapshot_download
-from utils import timer_decorator
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import timer_decorator
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import timer_decorator
 
 BASE_DIR = "/"
 TOKENIZER_PATTERNS = [["*.json", "tokenizer*"]]
diff --git a/src/engine.py b/src/engine.py
index 99d9132..c12a24b 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -15,10 +15,10 @@
 from vllm.entrypoints.openai.serving_models import BaseModelPath, LoRAModulePath, OpenAIServingModels
 
-from utils import DummyRequest, JobInput, BatchSize, create_error_response
-from constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE
-from tokenizer import TokenizerWrapper
-from engine_args import get_engine_args
+from .utils import DummyRequest, JobInput, BatchSize, create_error_response
+from .constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE
+from .tokenizer import TokenizerWrapper
+from .engine_args import get_engine_args
 
 
 class vLLMEngine:
     def __init__(self, engine = None):
@@ -204,7 +204,7 @@ def _load_lora_adapters(self):
     async def _initialize_engines(self):
         self.model_config = await self.llm.get_model_config()
         self.base_model_paths = [
-            BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model)
+            BaseModelPath(name=self.served_model_name, model_path=self.engine_args.model)
         ]
 
         self.serving_models = OpenAIServingModels(
diff --git a/src/engine_args.py b/src/engine_args.py
index b7cc991..08e047a 100644
--- a/src/engine_args.py
+++ b/src/engine_args.py
@@ -4,7 +4,13 @@
 from torch.cuda import device_count
 from vllm import AsyncEngineArgs
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from src.utils import convert_limit_mm_per_prompt
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import convert_limit_mm_per_prompt
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import convert_limit_mm_per_prompt
 
 RENAME_ARGS_MAP = {
     "MODEL_NAME": "model",
diff --git a/src/handler.py b/src/handler.py
index 176ec7e..2e67271 100644
--- a/src/handler.py
+++ b/src/handler.py
@@ -1,7 +1,14 @@
 import os
 import runpod
-from utils import JobInput
-from engine import vLLMEngine, OpenAIvLLMEngine
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import JobInput
+    from .engine import vLLMEngine, OpenAIvLLMEngine
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import JobInput
+    from engine import vLLMEngine, OpenAIvLLMEngine
 
 vllm_engine = vLLMEngine()
 OpenAIvLLMEngine = OpenAIvLLMEngine(vllm_engine)
diff --git a/src/handler_custom.py b/src/handler_custom.py
new file mode 100644
index 0000000..20eb3cd
--- /dev/null
+++ b/src/handler_custom.py
@@ -0,0 +1,230 @@
+import os
+import runpod
+import logging
+import uvicorn
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
+import json
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import JobInput
+    from .engine import vLLMEngine, OpenAIvLLMEngine
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import JobInput
+    from engine import vLLMEngine, OpenAIvLLMEngine
+try:
+    from importlib.metadata import version, PackageNotFoundError
+except ImportError:
+    # Python < 3.8
+    from importlib_metadata import version, PackageNotFoundError
+
+try:
+    __version__ = version("vllm-worker")
+except PackageNotFoundError:
+    # Package is not installed, fallback to reading VERSION file
+    from pathlib import Path
+    _version_file = Path(__file__).parent.parent / "VERSION"
+    __version__ = _version_file.read_text().strip()
+
+
+# Bypass MODEL_NAME since RunPod cannot set MODEL_NAME to a local or network volume path
+if os.getenv("MODEL_NAME_OVERRIDE"):
+    os.environ["MODEL_NAME"] = str(os.getenv("MODEL_NAME_OVERRIDE"))
+
+
+if os.getenv("ENABLE_MODEL_PATCH"):
+
+    import sys
+    import importlib
+
+    # Get MODEL_NAME path and go back 1 directory
+    model_path = str(os.getenv("MODEL_NAME"))
+    parent_dir = os.path.dirname(model_path)
+
+    # Get the directory name of MODEL_NAME to use as module name
+    model_dir_name = os.path.basename(model_path)
+
+    # Add parent directory to sys.path
+    sys.path.append(os.path.abspath(parent_dir))
+
+    # Dynamic import: from {model_dir_name} import modeling_dots_ocr_vllm
+    module = importlib.import_module(f"{model_dir_name}.modeling_dots_ocr_vllm")
+
+vllm_engine = vLLMEngine()
+openai_vllm_engine = OpenAIvLLMEngine(vllm_engine)
+
+# Use the MODE_TO_RUN environment variable; fall back to "pod" if not set
+mode_to_run = os.getenv("MODE_TO_RUN", "pod")
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+print("------- ENVIRONMENT VARIABLES -------")
+print("Mode running: ", mode_to_run)
+print("------- -------------------- -------")
+
+# Create FastAPI app
+app = FastAPI(title="vLLM OpenAI-Compatible API", version=__version__)
+
+
+async def handler(job):
+    job_input = JobInput(job["input"])
+    engine = openai_vllm_engine if job_input.openai_route else vllm_engine
+    results_generator = engine.generate(job_input)
+    async for batch in results_generator:
+        yield batch
+
+# FastAPI endpoints for OpenAI compatibility
+@app.get("/openai/v1/models")
+@app.get("/v1/models")
+async def get_models():
+    """Get available models"""
+    try:
+        job_input = JobInput({
+            "openai_route": "/v1/models",
+            "openai_input": {}
+        })
+        result_generator = openai_vllm_engine.generate(job_input)
+        async for result in result_generator:
+            return JSONResponse(content=result)
+    except Exception as e:
+        logger.error(f"Error in get_models: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/openai/v1/chat/completions")
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """Handle chat completions"""
+    try:
+        # Parse request body
+        request_data = await request.json()
+
+        # Create JobInput for OpenAI engine
+        job_input = JobInput({
+            "openai_route": "/v1/chat/completions",
+            "openai_input": request_data
+        })
+
+        # Check if streaming is requested
+        is_streaming = request_data.get("stream", False)
+
+        if is_streaming:
+            # Return streaming response
+            async def stream_generator():
+                result_generator = openai_vllm_engine.generate(job_input)
+                async for result in result_generator:
+                    if isinstance(result, str):
+                        # Raw OpenAI output format
+                        yield result
+                    elif isinstance(result, list):
+                        # Batch of responses
+                        for item in result:
+                            if isinstance(item, str):
+                                yield item
+                            else:
+                                yield f"data: {json.dumps(item)}\n\n"
+                    else:
+                        # Single response object
+                        yield f"data: {json.dumps(result)}\n\n"
+                yield "data: [DONE]\n\n"
+
+            return StreamingResponse(
+                stream_generator(),
+                media_type="text/plain",
+                headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
+            )
+        else:
+            # Return non-streaming response
+            result_generator = openai_vllm_engine.generate(job_input)
+            async for result in result_generator:
+                return JSONResponse(content=result)
+
+    except Exception as e:
+        logger.error(f"Error in chat_completions: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/openai/v1/completions")
+@app.post("/v1/completions")
+async def completions(request: Request):
+    """Handle text completions"""
+    try:
+        # Parse request body
+        request_data = await request.json()
+
+        # Create JobInput for OpenAI engine
+        job_input = JobInput({
+            "openai_route": "/v1/completions",
+            "openai_input": request_data
+        })
+
+        # Check if streaming is requested
+        is_streaming = request_data.get("stream", False)
+
+        if is_streaming:
+            # Return streaming response
+            async def stream_generator():
+                result_generator = openai_vllm_engine.generate(job_input)
+                async for result in result_generator:
+                    if isinstance(result, str):
+                        # Raw OpenAI output format
+                        yield result
+                    elif isinstance(result, list):
+                        # Batch of responses
+                        for item in result:
+                            if isinstance(item, str):
+                                yield item
+                            else:
+                                yield f"data: {json.dumps(item)}\n\n"
+                    else:
+                        # Single response object
+                        yield f"data: {json.dumps(result)}\n\n"
+                yield "data: [DONE]\n\n"
+
+            return StreamingResponse(
+                stream_generator(),
+                media_type="text/plain",
+                headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
+            )
+        else:
+            # Return non-streaming response
+            result_generator = openai_vllm_engine.generate(job_input)
+            async for result in result_generator:
+                return JSONResponse(content=result)
+
+    except Exception as e:
+        logger.error(f"Error in completions: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+# Health check endpoints
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy"}
+
+@app.get("/ping")
+async def ping():
+    """Simple ping endpoint"""
+    return {"status": "healthy"}
+
+if mode_to_run == "pod":
+    # Get ports from environment variables
+    port = int(os.getenv("PORT", 8000))
+    logger.info(f"Starting vLLM server on port {port}")
+
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=port,
+        log_level=os.getenv("LOG_LEVEL", "INFO").lower()
+    )
+else:
+    runpod.serverless.start(
+        {
+            "handler": handler,
+            "concurrency_modifier": lambda _: vllm_engine.max_concurrency,
+            "return_aggregate_stream": True,
+        }
+    )
\ No newline at end of file
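
For reference, a minimal client sketch against the FastAPI endpoints added in src/handler_custom.py above. This is an illustrative assumption, not part of the patch: it assumes the worker runs in pod mode on localhost with the default PORT of 8000, that the `requests` package is available on the client, and uses a placeholder model name in place of whatever `/v1/models` actually reports for your deployment.

```python
import json

import requests  # assumed to be installed in the client environment

BASE_URL = "http://localhost:8000"  # default PORT in handler_custom.py; adjust as needed

# List the models the worker is serving
print(requests.get(f"{BASE_URL}/v1/models").json())

# Non-streaming chat completion
resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "my-model",  # placeholder; use a name returned by /v1/models
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
print(resp.json())

# Streaming chat completion: the handler yields "data: ..." lines and a final "data: [DONE]"
with requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        print(json.loads(payload))
```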