diff --git a/runner/app/pipelines/llm.py b/runner/app/pipelines/llm.py index 3b6f477a7..1a9fcdbfb 100644 --- a/runner/app/pipelines/llm.py +++ b/runner/app/pipelines/llm.py @@ -76,12 +76,6 @@ def __init__( tensor_parallel_size = int(os.getenv("TENSOR_PARALLEL_SIZE", "1")) pipeline_parallel_size = int(os.getenv("PIPELINE_PARALLEL_SIZE", "1")) - if max_num_batched_tokens < max_model_len: - max_num_batched_tokens = max_model_len - logger.info( - f"max_num_batched_tokens ({max_num_batched_tokens}) is smaller than max_model_len ({max_model_len}). This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.") - logger.info(f"setting 'max_model_len' to equal 'max_num_batched_tokens'") - # Load config to check model compatibility try: config = AutoConfig.from_pretrained(self.local_model_path) @@ -118,6 +112,8 @@ def __init__( logger.info(f"Using pipeline parallel size: {pipeline_parallel_size}") logger.info(f"Total GPUs used: {total_gpus_needed}") + except AttributeError as ae: + logger.error(f"Cannot confirm tensor and pipeline parallelism. Confirm manually. ({ae})")
except Exception as e: logger.error(f"Error in parallelism configuration: {e}") raise diff --git a/runner/app/routes/llm.py b/runner/app/routes/llm.py index 8d95fdf04..63a1e5e17 100644 --- a/runner/app/routes/llm.py +++ b/runner/app/routes/llm.py @@ -32,6 +32,12 @@ ) @router.post("/llm/", response_model=LLMResponse , responses=RESPONSES, include_in_schema=False) +@router.post( + "/llm/chat/completions", + response_model=LLMResponse, + responses=RESPONSES, + include_in_schema=False, +) async def llm( request: LLMRequest, pipeline: Pipeline = Depends(get_pipeline), diff --git a/runner/docker/Dockerfile.llm b/runner/docker/Dockerfile.llm index e6ac94856..b289d7184 100644 --- a/runner/docker/Dockerfile.llm +++ b/runner/docker/Dockerfile.llm @@ -1,7 +1,7 @@ # Based on https://github.com/huggingface/api-inference-community/blob/main/docker_images/diffusers/Dockerfile -FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu20.04 -LABEL maintainer="Yondon Fu " +FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 + # Add any system dependency here # RUN apt-get update -y && apt-get install libXXX -y @@ -30,8 +30,8 @@ RUN pyenv install $PYTHON_VERSION && \ # Upgrade pip and install your desired packages ARG PIP_VERSION=24.2 -RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} setuptools==69.5.1 wheel==0.43.0 && \ - pip install --no-cache-dir torch==2.4.0 torchvision torchaudio pip-tools +RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} "setuptools>=69.5.1" "wheel>=0.43.0" && \ + pip install --no-cache-dir "torch>=2.4.0" torchvision torchaudio pip-tools ARG VERSION="undefined" ENV VERSION=${VERSION} diff --git a/runner/requirements.llm.in b/runner/requirements.llm.in index cb7bb5244..5d6be8b0e 100644 --- a/runner/requirements.llm.in +++ b/runner/requirements.llm.in @@ -1,4 +1,4 @@ -vllm==0.10.0 +vllm==0.11.0 diffusers accelerate transformers diff --git a/runner/requirements.llm.txt b/runner/requirements.llm.txt deleted file mode 100644 index 485b0dbd6..000000000 --- 
a/runner/requirements.llm.txt +++ /dev/null @@ -1,440 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.11 -# by the following command: -# -# pip-compile --output-file=requirements.llm.txt requirements.llm.in -# -accelerate==1.2.1 - # via - # -r requirements.llm.in - # peft -aiohappyeyeballs==2.4.4 - # via aiohttp -aiohttp==3.11.11 - # via vllm -aiosignal==1.3.2 - # via - # aiohttp - # ray -airportsdata==20241001 - # via outlines -annotated-types==0.7.0 - # via pydantic -anyio==4.8.0 - # via - # httpx - # openai - # starlette - # watchfiles -astor==0.8.1 - # via depyf -attrs==24.3.0 - # via - # aiohttp - # jsonschema - # referencing -av==14.0.1 - # via -r requirements.llm.in -bitsandbytes==0.45.0 - # via -r requirements.llm.in -blake3==1.0.1 - # via vllm -certifi==2024.12.14 - # via - # httpcore - # httpx - # requests -charset-normalizer==3.4.1 - # via requests -click==8.1.8 - # via - # ray - # uvicorn -cloudpickle==3.1.0 - # via outlines -compressed-tensors==0.8.1 - # via vllm -deepcache==0.1.1 - # via -r requirements.llm.in -depyf==0.18.0 - # via vllm -diffusers==0.32.1 - # via - # -r requirements.llm.in - # deepcache -dill==0.3.9 - # via depyf -diskcache==5.6.3 - # via outlines -distro==1.9.0 - # via openai -einops==0.8.0 - # via vllm -fastapi==0.115.6 - # via - # -r requirements.llm.in - # vllm -filelock==3.16.1 - # via - # diffusers - # huggingface-hub - # ray - # torch - # transformers - # triton - # vllm -frozenlist==1.5.0 - # via - # aiohttp - # aiosignal - # ray -fsspec==2024.12.0 - # via - # huggingface-hub - # torch -gguf==0.10.0 - # via vllm -h11==0.14.0 - # via - # httpcore - # uvicorn -httpcore==1.0.7 - # via httpx -httptools==0.6.4 - # via uvicorn -httpx==0.28.1 - # via openai -huggingface-hub==0.27.1 - # via - # -r requirements.llm.in - # accelerate - # diffusers - # peft - # tokenizers - # transformers -idna==3.10 - # via - # anyio - # httpx - # requests - # yarl -importlib-metadata==8.5.0 - # via - # diffusers - # vllm 
-iniconfig==2.0.0 - # via pytest -interegular==0.3.3 - # via - # lm-format-enforcer - # outlines - # outlines-core -jinja2==3.1.5 - # via - # outlines - # torch -jiter==0.8.2 - # via openai -jsonschema==4.23.0 - # via - # mistral-common - # outlines - # outlines-core - # ray -jsonschema-specifications==2024.10.1 - # via jsonschema -lark==1.2.2 - # via outlines -lm-format-enforcer==0.10.9 - # via vllm -markupsafe==3.0.2 - # via jinja2 -mistral-common[opencv]==1.5.1 - # via - # mistral-common - # vllm -mpmath==1.3.0 - # via sympy -msgpack==1.1.0 - # via ray -msgspec==0.19.0 - # via vllm -multidict==6.1.0 - # via - # aiohttp - # yarl -nest-asyncio==1.6.0 - # via outlines -networkx==3.4.2 - # via torch -numpy==1.26.4 - # via - # -r requirements.llm.in - # accelerate - # bitsandbytes - # diffusers - # gguf - # mistral-common - # opencv-python-headless - # outlines - # peft - # scipy - # torchvision - # transformers - # vllm - # xformers -nvidia-cublas-cu12==12.4.5.8 - # via - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 - # torch -nvidia-cuda-cupti-cu12==12.4.127 - # via torch -nvidia-cuda-nvrtc-cu12==12.4.127 - # via torch -nvidia-cuda-runtime-cu12==12.4.127 - # via torch -nvidia-cudnn-cu12==9.1.0.70 - # via torch -nvidia-cufft-cu12==11.2.1.3 - # via torch -nvidia-curand-cu12==10.3.5.147 - # via torch -nvidia-cusolver-cu12==11.6.1.9 - # via torch -nvidia-cusparse-cu12==12.3.1.170 - # via - # nvidia-cusolver-cu12 - # torch -nvidia-ml-py==12.560.30 - # via vllm -nvidia-nccl-cu12==2.21.5 - # via torch -nvidia-nvjitlink-cu12==12.4.127 - # via - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 - # torch -nvidia-nvtx-cu12==12.4.127 - # via torch -openai==1.59.6 - # via vllm -opencv-python-headless==4.10.0.84 - # via mistral-common -outlines==0.1.11 - # via vllm -outlines-core==0.1.26 - # via outlines -packaging==24.2 - # via - # accelerate - # huggingface-hub - # lm-format-enforcer - # peft - # pytest - # ray - # transformers -partial-json-parser==0.2.1.1.post5 - # via vllm 
-peft==0.14.0 - # via -r requirements.llm.in -pillow==10.4.0 - # via - # -r requirements.llm.in - # diffusers - # mistral-common - # torchvision - # vllm -pluggy==1.5.0 - # via pytest -prometheus-client==0.21.1 - # via - # prometheus-fastapi-instrumentator - # vllm -prometheus-fastapi-instrumentator==7.0.0 - # via vllm -propcache==0.2.1 - # via - # aiohttp - # yarl -protobuf==5.29.3 - # via - # -r requirements.llm.in - # ray - # vllm -psutil==6.1.1 - # via - # -r requirements.llm.in - # accelerate - # peft - # vllm -py-cpuinfo==9.0.0 - # via vllm -pybind11==2.13.6 - # via xgrammar -pycountry==24.6.1 - # via outlines -pydantic==2.10.5 - # via - # -r requirements.llm.in - # compressed-tensors - # fastapi - # lm-format-enforcer - # mistral-common - # openai - # outlines - # vllm - # xgrammar -pydantic-core==2.27.2 - # via pydantic -pytest==8.3.4 - # via xgrammar -python-dotenv==1.0.1 - # via uvicorn -python-multipart==0.0.20 - # via -r requirements.llm.in -pyyaml==6.0.2 - # via - # accelerate - # gguf - # huggingface-hub - # lm-format-enforcer - # peft - # ray - # transformers - # uvicorn - # vllm -pyzmq==26.2.0 - # via vllm -ray==2.40.0 - # via vllm -referencing==0.35.1 - # via - # jsonschema - # jsonschema-specifications - # outlines -regex==2024.11.6 - # via - # diffusers - # tiktoken - # transformers -requests==2.32.3 - # via - # diffusers - # huggingface-hub - # mistral-common - # outlines - # ray - # tiktoken - # transformers - # vllm -rpds-py==0.22.3 - # via - # jsonschema - # referencing -safetensors==0.5.2 - # via - # -r requirements.llm.in - # accelerate - # diffusers - # peft - # transformers -scipy==1.15.0 - # via -r requirements.llm.in -sentencepiece==0.2.0 - # via - # -r requirements.llm.in - # mistral-common - # vllm - # xgrammar -sniffio==1.3.1 - # via - # anyio - # openai -starlette==0.41.3 - # via - # fastapi - # prometheus-fastapi-instrumentator -sympy==1.13.1 - # via torch -tiktoken==0.7.0 - # via - # mistral-common - # vllm - # xgrammar 
-tokenizers==0.21.0 - # via - # transformers - # vllm -torch==2.5.1 - # via - # accelerate - # bitsandbytes - # compressed-tensors - # deepcache - # outlines - # peft - # torchvision - # vllm - # xformers - # xgrammar -torchvision==0.20.1 - # via vllm -tqdm==4.67.1 - # via - # gguf - # huggingface-hub - # openai - # outlines - # peft - # transformers - # vllm -transformers==4.48.0 - # via - # -r requirements.llm.in - # compressed-tensors - # deepcache - # peft - # vllm - # xgrammar -triton==3.1.0 - # via - # -r requirements.llm.in - # torch -typing-extensions==4.12.2 - # via - # anyio - # bitsandbytes - # fastapi - # huggingface-hub - # mistral-common - # openai - # outlines - # pydantic - # pydantic-core - # torch - # vllm -urllib3==2.3.0 - # via requests -uvicorn[standard]==0.34.0 - # via - # -r requirements.llm.in - # vllm -uvloop==0.21.0 - # via uvicorn -vllm==0.6.5 - # via -r requirements.llm.in -watchfiles==1.0.4 - # via uvicorn -websockets==14.1 - # via uvicorn -xformers==0.0.28.post3 - # via - # -r requirements.llm.in - # vllm -xgrammar==0.1.9 - # via vllm -yarl==1.18.3 - # via aiohttp -zipp==3.21.0 - # via importlib-metadata