Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions runner/app/pipelines/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,6 @@ def __init__(
tensor_parallel_size = int(os.getenv("TENSOR_PARALLEL_SIZE", "1"))
pipeline_parallel_size = int(os.getenv("PIPELINE_PARALLEL_SIZE", "1"))

if max_num_batched_tokens < max_model_len:
max_num_batched_tokens = max_model_len
logger.info(
f"max_num_batched_tokens ({max_num_batched_tokens}) is smaller than max_model_len ({max_model_len}). This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.")
logger.info(f"setting 'max_model_len' to equal 'max_num_batched_tokens'")

# Load config to check model compatibility
try:
config = AutoConfig.from_pretrained(self.local_model_path)
Expand Down Expand Up @@ -118,6 +112,8 @@ def __init__(
logger.info(f"Using pipeline parallel size: {pipeline_parallel_size}")
logger.info(f"Total GPUs used: {total_gpus_needed}")

except AttributeError as ae:
logger.error(f"Cannot confirm tensor and pipeline parallelism. Confirm manually. ({ae})")
except Exception as e:
logger.error(f"Error in parallelism configuration: {e}")
raise
Expand Down
6 changes: 6 additions & 0 deletions runner/app/routes/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@
)
@router.post("/llm/", response_model=LLMResponse
, responses=RESPONSES, include_in_schema=False)
@router.post(
"/llm/chat/completions",
response_model=LLMResponse,
responses=RESPONSES,
include_in_schema=False,
)
async def llm(
request: LLMRequest,
pipeline: Pipeline = Depends(get_pipeline),
Expand Down
8 changes: 4 additions & 4 deletions runner/docker/Dockerfile.llm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Based on https://github.com/huggingface/api-inference-community/blob/main/docker_images/diffusers/Dockerfile

FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Yondon Fu <yondon@livepeer.org>"
FROM nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04


# Add any system dependency here
# RUN apt-get update -y && apt-get install libXXX -y
Expand Down Expand Up @@ -30,8 +30,8 @@ RUN pyenv install $PYTHON_VERSION && \

# Upgrade pip and install your desired packages
ARG PIP_VERSION=24.2
RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} setuptools==69.5.1 wheel==0.43.0 && \
pip install --no-cache-dir torch==2.4.0 torchvision torchaudio pip-tools
# NOTE: version specifiers containing '>' MUST be quoted; unquoted, the shell
# treats `setuptools>=69.5.1` as `setuptools > =69.5.1` (an output redirection),
# so pip installs an unconstrained package and junk files like `=69.5.1` are
# created in the build context layer.
RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} "setuptools>=69.5.1" "wheel>=0.43.0" && \
    pip install --no-cache-dir "torch>=2.4.0" torchvision torchaudio pip-tools

ARG VERSION="undefined"
ENV VERSION=${VERSION}
Expand Down
2 changes: 1 addition & 1 deletion runner/requirements.llm.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
vllm==0.10.0
vllm==0.11.0
diffusers
accelerate
transformers
Expand Down
Loading
Loading