Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions runner/app/pipelines/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,6 @@ def __init__(
tensor_parallel_size = int(os.getenv("TENSOR_PARALLEL_SIZE", "1"))
pipeline_parallel_size = int(os.getenv("PIPELINE_PARALLEL_SIZE", "1"))

if max_num_batched_tokens < max_model_len:
max_num_batched_tokens = max_model_len
logger.info(
f"max_num_batched_tokens ({max_num_batched_tokens}) is smaller than max_model_len ({max_model_len}). This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.")
logger.info(f"setting 'max_model_len' to equal 'max_num_batched_tokens'")

# Load config to check model compatibility
try:
config = AutoConfig.from_pretrained(self.local_model_path)
Expand Down Expand Up @@ -118,6 +112,8 @@ def __init__(
logger.info(f"Using pipeline parallel size: {pipeline_parallel_size}")
logger.info(f"Total GPUs used: {total_gpus_needed}")

except AttributeError as ae:
logger.error(f"Cannot confirm tensor and pipeline parallelism. Confirm manually. ({ae})")
except Exception as e:
logger.error(f"Error in parallelism configuration: {e}")
raise
Expand Down
6 changes: 6 additions & 0 deletions runner/app/routes/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@
)
@router.post("/llm/", response_model=LLMResponse
, responses=RESPONSES, include_in_schema=False)
@router.post(
"/llm/chat/completions",
response_model=LLMResponse,
responses=RESPONSES,
include_in_schema=False,
)
async def llm(
request: LLMRequest,
pipeline: Pipeline = Depends(get_pipeline),
Expand Down
8 changes: 4 additions & 4 deletions runner/docker/Dockerfile.llm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Based on https://github.com/huggingface/api-inference-community/blob/main/docker_images/diffusers/Dockerfile

FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Yondon Fu <yondon@livepeer.org>"
FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04


# Add any system dependency here
# RUN apt-get update -y && apt-get install libXXX -y
Expand Down Expand Up @@ -30,8 +30,8 @@ RUN pyenv install $PYTHON_VERSION && \

# Upgrade pip and install your desired packages
ARG PIP_VERSION=24.2
RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} setuptools==69.5.1 wheel==0.43.0 && \
pip install --no-cache-dir torch==2.4.0 torchvision torchaudio pip-tools
# NOTE: version specifiers containing '>' MUST be quoted; unquoted, the shell
# treats `setuptools>=69.5.1` as `setuptools > =69.5.1` (an output redirection),
# so pip installs an unconstrained package and junk files like `=69.5.1` are
# created in the build context layer.
RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} "setuptools>=69.5.1" "wheel>=0.43.0" && \
    pip install --no-cache-dir "torch>=2.4.0" torchvision torchaudio pip-tools

ARG VERSION="undefined"
ENV VERSION=${VERSION}
Expand Down
2 changes: 1 addition & 1 deletion runner/requirements.llm.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
vllm==0.10.0
vllm==0.11.0
diffusers
accelerate
transformers
Expand Down
Loading
Loading