diff --git a/runner/app/pipelines/llm.py b/runner/app/pipelines/llm.py index 3b6f477a7..1a9fcdbfb 100644 --- a/runner/app/pipelines/llm.py +++ b/runner/app/pipelines/llm.py @@ -76,12 +76,6 @@ def __init__( tensor_parallel_size = int(os.getenv("TENSOR_PARALLEL_SIZE", "1")) pipeline_parallel_size = int(os.getenv("PIPELINE_PARALLEL_SIZE", "1")) - if max_num_batched_tokens < max_model_len: - max_num_batched_tokens = max_model_len - logger.info( - f"max_num_batched_tokens ({max_num_batched_tokens}) is smaller than max_model_len ({max_model_len}). This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.") - logger.info(f"setting 'max_model_len' to equal 'max_num_batched_tokens'") - # Load config to check model compatibility try: config = AutoConfig.from_pretrained(self.local_model_path) @@ -118,6 +112,8 @@ def __init__( logger.info(f"Using pipeline parallel size: {pipeline_parallel_size}") logger.info(f"Total GPUs used: {total_gpus_needed}") + except AttributeError as ae: + logger.error(f"Cannot confirm tensor and pipeline parallelism. Confirm manually. ({ae})")
except Exception as e: logger.error(f"Error in parallelism configuration: {e}") raise diff --git a/runner/app/routes/llm.py b/runner/app/routes/llm.py index 8d95fdf04..63a1e5e17 100644 --- a/runner/app/routes/llm.py +++ b/runner/app/routes/llm.py @@ -32,6 +32,12 @@ ) @router.post("/llm/", response_model=LLMResponse , responses=RESPONSES, include_in_schema=False) +@router.post( + "/llm/chat/completions", + response_model=LLMResponse, + responses=RESPONSES, + include_in_schema=False, +) async def llm( request: LLMRequest, pipeline: Pipeline = Depends(get_pipeline), diff --git a/runner/docker/Dockerfile.llm b/runner/docker/Dockerfile.llm index e6ac94856..b289d7184 100644 --- a/runner/docker/Dockerfile.llm +++ b/runner/docker/Dockerfile.llm @@ -1,7 +1,7 @@ # Based on https://github.com/huggingface/api-inference-community/blob/main/docker_images/diffusers/Dockerfile -FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu20.04 -LABEL maintainer="Yondon Fu " +FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 + # Add any system dependency here # RUN apt-get update -y && apt-get install libXXX -y @@ -30,8 +30,8 @@ RUN pyenv install $PYTHON_VERSION && \ # Upgrade pip and install your desired packages ARG PIP_VERSION=24.2 -RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} setuptools==69.5.1 wheel==0.43.0 && \ - pip install --no-cache-dir torch==2.4.0 torchvision torchaudio pip-tools +RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} "setuptools>=69.5.1" "wheel>=0.43.0" && \ + pip install --no-cache-dir "torch>=2.4.0" torchvision torchaudio pip-tools ARG VERSION="undefined" ENV VERSION=${VERSION} diff --git a/runner/requirements.llm.in b/runner/requirements.llm.in index cb7bb5244..5d6be8b0e 100644 --- a/runner/requirements.llm.in +++ b/runner/requirements.llm.in @@ -1,4 +1,4 @@ -vllm==0.10.0 +vllm==0.11.0 diffusers accelerate transformers diff --git a/runner/requirements.llm.txt b/runner/requirements.llm.txt deleted file mode 100644 index 485b0dbd6..000000000 --- 
a/runner/requirements.llm.txt +++ /dev/null @@ -1,440 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.11 -# by the following command: -# -# pip-compile --output-file=requirements.llm.txt requirements.llm.in -# -accelerate==1.2.1 - # via - # -r requirements.llm.in - # peft -aiohappyeyeballs==2.4.4 - # via aiohttp -aiohttp==3.11.11 - # via vllm -aiosignal==1.3.2 - # via - # aiohttp - # ray -airportsdata==20241001 - # via outlines -annotated-types==0.7.0 - # via pydantic -anyio==4.8.0 - # via - # httpx - # openai - # starlette - # watchfiles -astor==0.8.1 - # via depyf -attrs==24.3.0 - # via - # aiohttp - # jsonschema - # referencing -av==14.0.1 - # via -r requirements.llm.in -bitsandbytes==0.45.0 - # via -r requirements.llm.in -blake3==1.0.1 - # via vllm -certifi==2024.12.14 - # via - # httpcore - # httpx - # requests -charset-normalizer==3.4.1 - # via requests -click==8.1.8 - # via - # ray - # uvicorn -cloudpickle==3.1.0 - # via outlines -compressed-tensors==0.8.1 - # via vllm -deepcache==0.1.1 - # via -r requirements.llm.in -depyf==0.18.0 - # via vllm -diffusers==0.32.1 - # via - # -r requirements.llm.in - # deepcache -dill==0.3.9 - # via depyf -diskcache==5.6.3 - # via outlines -distro==1.9.0 - # via openai -einops==0.8.0 - # via vllm -fastapi==0.115.6 - # via - # -r requirements.llm.in - # vllm -filelock==3.16.1 - # via - # diffusers - # huggingface-hub - # ray - # torch - # transformers - # triton - # vllm -frozenlist==1.5.0 - # via - # aiohttp - # aiosignal - # ray -fsspec==2024.12.0 - # via - # huggingface-hub - # torch -gguf==0.10.0 - # via vllm -h11==0.14.0 - # via - # httpcore - # uvicorn -httpcore==1.0.7 - # via httpx -httptools==0.6.4 - # via uvicorn -httpx==0.28.1 - # via openai -huggingface-hub==0.27.1 - # via - # -r requirements.llm.in - # accelerate - # diffusers - # peft - # tokenizers - # transformers -idna==3.10 - # via - # anyio - # httpx - # requests - # yarl -importlib-metadata==8.5.0 - # via - # diffusers - # vllm 
-iniconfig==2.0.0 - # via pytest -interegular==0.3.3 - # via - # lm-format-enforcer - # outlines - # outlines-core -jinja2==3.1.5 - # via - # outlines - # torch -jiter==0.8.2 - # via openai -jsonschema==4.23.0 - # via - # mistral-common - # outlines - # outlines-core - # ray -jsonschema-specifications==2024.10.1 - # via jsonschema -lark==1.2.2 - # via outlines -lm-format-enforcer==0.10.9 - # via vllm -markupsafe==3.0.2 - # via jinja2 -mistral-common[opencv]==1.5.1 - # via - # mistral-common - # vllm -mpmath==1.3.0 - # via sympy -msgpack==1.1.0 - # via ray -msgspec==0.19.0 - # via vllm -multidict==6.1.0 - # via - # aiohttp - # yarl -nest-asyncio==1.6.0 - # via outlines -networkx==3.4.2 - # via torch -numpy==1.26.4 - # via - # -r requirements.llm.in - # accelerate - # bitsandbytes - # diffusers - # gguf - # mistral-common - # opencv-python-headless - # outlines - # peft - # scipy - # torchvision - # transformers - # vllm - # xformers -nvidia-cublas-cu12==12.4.5.8 - # via - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 - # torch -nvidia-cuda-cupti-cu12==12.4.127 - # via torch -nvidia-cuda-nvrtc-cu12==12.4.127 - # via torch -nvidia-cuda-runtime-cu12==12.4.127 - # via torch -nvidia-cudnn-cu12==9.1.0.70 - # via torch -nvidia-cufft-cu12==11.2.1.3 - # via torch -nvidia-curand-cu12==10.3.5.147 - # via torch -nvidia-cusolver-cu12==11.6.1.9 - # via torch -nvidia-cusparse-cu12==12.3.1.170 - # via - # nvidia-cusolver-cu12 - # torch -nvidia-ml-py==12.560.30 - # via vllm -nvidia-nccl-cu12==2.21.5 - # via torch -nvidia-nvjitlink-cu12==12.4.127 - # via - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 - # torch -nvidia-nvtx-cu12==12.4.127 - # via torch -openai==1.59.6 - # via vllm -opencv-python-headless==4.10.0.84 - # via mistral-common -outlines==0.1.11 - # via vllm -outlines-core==0.1.26 - # via outlines -packaging==24.2 - # via - # accelerate - # huggingface-hub - # lm-format-enforcer - # peft - # pytest - # ray - # transformers -partial-json-parser==0.2.1.1.post5 - # via vllm 
-peft==0.14.0 - # via -r requirements.llm.in -pillow==10.4.0 - # via - # -r requirements.llm.in - # diffusers - # mistral-common - # torchvision - # vllm -pluggy==1.5.0 - # via pytest -prometheus-client==0.21.1 - # via - # prometheus-fastapi-instrumentator - # vllm -prometheus-fastapi-instrumentator==7.0.0 - # via vllm -propcache==0.2.1 - # via - # aiohttp - # yarl -protobuf==5.29.3 - # via - # -r requirements.llm.in - # ray - # vllm -psutil==6.1.1 - # via - # -r requirements.llm.in - # accelerate - # peft - # vllm -py-cpuinfo==9.0.0 - # via vllm -pybind11==2.13.6 - # via xgrammar -pycountry==24.6.1 - # via outlines -pydantic==2.10.5 - # via - # -r requirements.llm.in - # compressed-tensors - # fastapi - # lm-format-enforcer - # mistral-common - # openai - # outlines - # vllm - # xgrammar -pydantic-core==2.27.2 - # via pydantic -pytest==8.3.4 - # via xgrammar -python-dotenv==1.0.1 - # via uvicorn -python-multipart==0.0.20 - # via -r requirements.llm.in -pyyaml==6.0.2 - # via - # accelerate - # gguf - # huggingface-hub - # lm-format-enforcer - # peft - # ray - # transformers - # uvicorn - # vllm -pyzmq==26.2.0 - # via vllm -ray==2.40.0 - # via vllm -referencing==0.35.1 - # via - # jsonschema - # jsonschema-specifications - # outlines -regex==2024.11.6 - # via - # diffusers - # tiktoken - # transformers -requests==2.32.3 - # via - # diffusers - # huggingface-hub - # mistral-common - # outlines - # ray - # tiktoken - # transformers - # vllm -rpds-py==0.22.3 - # via - # jsonschema - # referencing -safetensors==0.5.2 - # via - # -r requirements.llm.in - # accelerate - # diffusers - # peft - # transformers -scipy==1.15.0 - # via -r requirements.llm.in -sentencepiece==0.2.0 - # via - # -r requirements.llm.in - # mistral-common - # vllm - # xgrammar -sniffio==1.3.1 - # via - # anyio - # openai -starlette==0.41.3 - # via - # fastapi - # prometheus-fastapi-instrumentator -sympy==1.13.1 - # via torch -tiktoken==0.7.0 - # via - # mistral-common - # vllm - # xgrammar 
-tokenizers==0.21.0 - # via - # transformers - # vllm -torch==2.5.1 - # via - # accelerate - # bitsandbytes - # compressed-tensors - # deepcache - # outlines - # peft - # torchvision - # vllm - # xformers - # xgrammar -torchvision==0.20.1 - # via vllm -tqdm==4.67.1 - # via - # gguf - # huggingface-hub - # openai - # outlines - # peft - # transformers - # vllm -transformers==4.48.0 - # via - # -r requirements.llm.in - # compressed-tensors - # deepcache - # peft - # vllm - # xgrammar -triton==3.1.0 - # via - # -r requirements.llm.in - # torch -typing-extensions==4.12.2 - # via - # anyio - # bitsandbytes - # fastapi - # huggingface-hub - # mistral-common - # openai - # outlines - # pydantic - # pydantic-core - # torch - # vllm -urllib3==2.3.0 - # via requests -uvicorn[standard]==0.34.0 - # via - # -r requirements.llm.in - # vllm -uvloop==0.21.0 - # via uvicorn -vllm==0.6.5 - # via -r requirements.llm.in -watchfiles==1.0.4 - # via uvicorn -websockets==14.1 - # via uvicorn -xformers==0.0.28.post3 - # via - # -r requirements.llm.in - # vllm -xgrammar==0.1.9 - # via vllm -yarl==1.18.3 - # via aiohttp -zipp==3.21.0 - # via importlib-metadata