diff --git a/README.md b/README.md index fe3c21c..5b6f20c 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,8 @@ For detailed customization examples and best practices, refer to the [Neuron DLC | Framework | Neuron Packages | Neuron SDK Version | Supported EC2 Instance Types | Python Version Options | ECR Public URL | Other Packages | |-----------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------|--------------------|------------------------------|------------------------|--------------------------------------------------------------------------------------------|-------------------| -| [PyTorch 2.9.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.28.0/pytorch/inference/2.9.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_inference, torch-neuronx | Neuron 2.28.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-inference-neuronx:2.9.0-neuronx-py312-sdk2.28.0-ubuntu24.04 | torchserve 0.11.0 | +| [PyTorch 2.9.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.29.0/pytorch/inference/2.9.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_inference, torch-neuronx, nki | Neuron 2.29.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-inference-neuronx:2.9.0-neuronx-py312-sdk2.29.0-ubuntu24.04 | torchserve 0.11.0 | +| [PyTorch 2.9.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.28.0/pytorch/inference/2.9.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_inference, torch-neuronx, nki | Neuron 2.28.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-inference-neuronx:2.9.0-neuronx-py312-sdk2.28.0-ubuntu24.04 | torchserve 0.11.0 | | [PyTorch 
2.9.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.27.1/pytorch/inference/2.9.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_inference, torch-neuronx | Neuron 2.27.1 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-inference-neuronx:2.9.0-neuronx-py312-sdk2.27.1-ubuntu24.04 | torchserve 0.11.0 | | [PyTorch 2.8.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.26.1/pytorch/inference/2.8.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_inference, torch-neuronx | Neuron 2.26.1 | trn1,trn2,inf2 | 3.11 (py311) | public.ecr.aws/neuron/pytorch-inference-neuronx:2.8.0-neuronx-py311-sdk2.26.1-ubuntu22.04 | torchserve 0.11.0 | | [PyTorch 2.7.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.25.0/docker/pytorch/inference/2.7.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_inference, torch-neuronx, transformers-neuronx | Neuron 2.25.0 | trn1,trn2,inf2 | 3.10 (py310) | public.ecr.aws/neuron/pytorch-inference-neuronx:2.7.0-neuronx-py310-sdk2.25.0-ubuntu22.04 | torchserve 0.11.0 | @@ -52,7 +53,8 @@ For detailed customization examples and best practices, refer to the [Neuron DLC | Framework | Neuron Packages | Neuron SDK Version | Supported EC2 Instance Types | Python Version Options | ECR Public URL | |----------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------|--------------------|------------------------------|------------------------|-------------------------------------------------------------------------------------------| -| [PyTorch 2.9.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.28.0/pytorch/training/2.9.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx | Neuron 2.28.0 | trn1,trn2,trn3,inf2 
| 3.12 (py312) | public.ecr.aws/neuron/pytorch-training-neuronx:2.9.0-neuronx-py312-sdk2.28.0-ubuntu24.04 | +| [PyTorch 2.9.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.29.0/pytorch/training/2.9.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx, nki | Neuron 2.29.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-training-neuronx:2.9.0-neuronx-py312-sdk2.29.0-ubuntu24.04 | +| [PyTorch 2.9.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.28.0/pytorch/training/2.9.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx, nki | Neuron 2.28.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-training-neuronx:2.9.0-neuronx-py312-sdk2.28.0-ubuntu24.04 | | [PyTorch 2.9.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.27.1/pytorch/training/2.9.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx | Neuron 2.27.1 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-training-neuronx:2.9.0-neuronx-py312-sdk2.27.1-ubuntu24.04 | | [PyTorch 2.8.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.26.1/pytorch/training/2.8.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx | Neuron 2.26.1 | trn1,trn2,inf2 | 3.11 (py311) | public.ecr.aws/neuron/pytorch-training-neuronx:2.8.0-neuronx-py311-sdk2.26.1-ubuntu22.04 | | [PyTorch 2.7.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.25.0/docker/pytorch/training/2.7.0/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx | Neuron 2.25.0 | trn1,trn2,inf2 | 3.10 (py310) | public.ecr.aws/neuron/pytorch-training-neuronx:2.7.0-neuronx-py310-sdk2.25.0-ubuntu22.04 | @@ -65,7 +67,8 @@ For detailed customization examples and best practices, refer to 
the [Neuron DLC | Framework | Neuron Packages | Neuron SDK Version | Supported EC2 Instance Types | Python Version Options | ECR Public URL | Other Packages | |----------------------------------------------------------------------------------------------------------------------------------------|---------------------------------|--------------------|------------------------------|------------------------|------------------------------------------------------------------------------------------|-------------------| -| [JAX 0.7](https://github.com/aws-neuron/deep-learning-containers/blob/2.28.0/jax/training/0.7/Dockerfile.neuronx) | jax-neuronx, libneuronxla | Neuron 2.28.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/jax-training-neuronx:0.7-neuronx-py312-sdk2.28.0-ubuntu24.04 | jaxlib 0.7 | +| [JAX 0.7](https://github.com/aws-neuron/deep-learning-containers/blob/2.29.0/jax/training/0.7/Dockerfile.neuronx) | jax-neuronx, libneuronxla, nki | Neuron 2.29.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/jax-training-neuronx:0.7-neuronx-py312-sdk2.29.0-ubuntu24.04 | jaxlib 0.7 | +| [JAX 0.7](https://github.com/aws-neuron/deep-learning-containers/blob/2.28.0/jax/training/0.7/Dockerfile.neuronx) | jax-neuronx, libneuronxla, nki | Neuron 2.28.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/jax-training-neuronx:0.7-neuronx-py312-sdk2.28.0-ubuntu24.04 | jaxlib 0.7 | | [JAX 0.7](https://github.com/aws-neuron/deep-learning-containers/blob/2.27.1/jax/training/0.7/Dockerfile.neuronx) | jax-neuronx, libneuronxla | Neuron 2.27.1 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/jax-training-neuronx:0.7-neuronx-py312-sdk2.27.1-ubuntu24.04 | jaxlib 0.7 | | [JAX 0.6](https://github.com/aws-neuron/deep-learning-containers/blob/2.26.1/jax/training/0.6/Dockerfile.neuronx) | jax-neuronx, libneuronxla | Neuron 2.26.1 | trn1,trn2,inf2 | 3.11 (py311) | public.ecr.aws/neuron/jax-training-neuronx:0.6-neuronx-py311-sdk2.26.1-ubuntu22.04 | 
jaxlib 0.6 | | [JAX 0.6](https://github.com/aws-neuron/deep-learning-containers/blob/2.25.0/docker/jax/training/0.6/Dockerfile.neuronx) | jax-neuronx, libneuronxla | Neuron 2.25.0 | trn1,trn2,inf2 | 3.10 (py310) | public.ecr.aws/neuron/jax-training-neuronx:0.6-neuronx-py310-sdk2.25.0-ubuntu22.04 | jaxlib 0.6 | @@ -75,6 +78,7 @@ For detailed customization examples and best practices, refer to the [Neuron DLC | Framework | Neuron Packages | Neuron SDK Version | Supported EC2 Instance Types | Python Version Options | ECR Public URL | |----------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------|--------------------|------------------------------|------------------------|-------------------------------------------------------------------------------------------| +| [VLLM 0.16.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.29.0/vllm/inference/0.16.0/Dockerfile.neuronx) | vllm-neuronx, libneuronxla, neuronx_distributed, neuronx_distributed_inference, torch-neuronx, aws-neuronx-tools, nki | Neuron 2.29.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.16.0-neuronx-py312-sdk2.29.0-ubuntu24.04 | | [VLLM 0.13.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.28.0/vllm/inference/0.13.0/Dockerfile.neuronx) | vllm-neuronx, libneuronxla, neuronx_distributed, neuronx_distributed_inference, torch-neuronx, aws-neuronx-tools, nki | Neuron 2.28.0 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.13.0-neuronx-py312-sdk2.28.0-ubuntu24.04 | | [VLLM 0.13.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.27.1/vllm/inference/0.13.0/Dockerfile.neuronx) | vllm-neuronx, libneuronxla, neuronx_distributed, neuronx_distributed_inference, torch-neuronx, aws-neuronx-tools, nki | Neuron 2.27.1 | trn1,trn2,trn3,inf2 | 3.12 (py312) | 
public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.13.0-neuronx-py312-sdk2.27.1-ubuntu24.04 |
 | [VLLM 0.11.0](https://github.com/aws-neuron/deep-learning-containers/blob/2.27.1/vllm/inference/0.11.0/Dockerfile.neuronx) | vllm-neuronx, libneuronxla, neuronx_distributed, neuronx_distributed_inference, torch-neuronx, aws-neuronx-tools | Neuron 2.27.1 | trn1,trn2,trn3,inf2 | 3.12 (py312) | public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.11.0-neuronx-py312-sdk2.27.1-ubuntu24.04 |
diff --git a/jax/training/0.7/Dockerfile.neuronx b/jax/training/0.7/Dockerfile.neuronx
index f601d94..21fa457 100644
--- a/jax/training/0.7/Dockerfile.neuronx
+++ b/jax/training/0.7/Dockerfile.neuronx
@@ -102,13 +102,18 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER
     && ln -s /usr/local/bin/pip3 /usr/bin/pip \
     && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
     && ${PIP} --no-cache-dir install --upgrade \
-        "awscli<2" \
         pip \
         requests \
         setuptools \
         uv \
     && rm -rf ~/.cache/pip/*
 
+# Install AWS CLI v2 (-f/--retry: abort on HTTP errors instead of unzipping an error page)
+RUN curl --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+    && unzip awscliv2.zip \
+    && ./aws/install \
+    && rm -rf awscliv2.zip aws
+
 # U24 will not allow installation of pip packages outside of venv without this flag
 # This is because U24 ships with Python 3.12 by default and installation into the Python
 # interpreter’s directory are disabled outside of a virtual environment.
@@ -166,13 +171,13 @@ RUN mkdir -p /etc/apt/keyrings \
     && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
 
 # Neuron SDK components version numbers
-ARG NEURONX_RUNTIME_LIB_VERSION=2.30.51.0-faafe26f0
-ARG NEURONX_COLLECTIVES_LIB_VERSION=2.30.59.0-f5cdefb39
-ARG NEURONX_TOOLS_VERSION=2.28.23.0-f1c114a9d
+ARG NEURONX_RUNTIME_LIB_VERSION=2.31.24.0-0b044f4ce
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.31.24.0-1a31ba186
+ARG NEURONX_TOOLS_VERSION=2.29.18.0-d5fe7ba42
 
-ARG NEURONX_CC_VERSION=2.23.6484.0+3b612583
-ARG NEURONX_JAX_TRAINING_VERSION=0.7.0.1.0.7584+5c8f4c3c
-ARG NKI_VERSION=0.2.0+g82fdb402
+ARG NEURONX_CC_VERSION=2.24.5133.0+58f8de22
+ARG NEURONX_JAX_TRAINING_VERSION=0.7.0.1.0.8181+1e892be0
+ARG NKI_VERSION=0.3.0+23928721754.g18aa1271
 
 FROM base AS repo
 
@@ -191,7 +196,7 @@ RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP
         --index-url ${PIP_REPO_URL} \
         --extra-index-url ${PYPI_SIMPLE_URL} \
         --trusted-host ${NEURON_PIP_REPO} \
-        "neuronx-cc>=2.0" \
+        neuronx-cc \
         jax-neuronx \
         nki \
     && rm -rf ~/.cache/pip/*
@@ -222,6 +227,12 @@ RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP
 
 FROM ${BUILD_STAGE} AS final
 
+# Upgrade OS packages to latest versions
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
 # Starts framework
 ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
 CMD ["/bin/bash"]
diff --git a/pytorch/inference/2.9.0/Dockerfile.neuronx b/pytorch/inference/2.9.0/Dockerfile.neuronx
index 244eedb..12db83c 100644
--- a/pytorch/inference/2.9.0/Dockerfile.neuronx
+++ b/pytorch/inference/2.9.0/Dockerfile.neuronx
@@ -117,17 +117,22 @@ RUN apt-get update \
     && rm -rf /tmp/tmp* \
     && apt-get clean
 
+# Install AWS CLI v2 (-f/--retry: abort on HTTP errors instead of unzipping an error page)
+RUN curl --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+    && unzip awscliv2.zip \
+    && ./aws/install \
+    && rm -rf awscliv2.zip aws
+
 RUN ${PIP} install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
     && ${PIP} install --no-cache-dir -U \
-        "opencv-python>=4.8.1.78" \
-        "scipy>=1.8.0" \
+        opencv-python \
+        scipy \
         six \
-        "awscli<2" \
         pandas \
         boto3 \
         uv \
         cryptography \
-        "protobuf>=3.18.3,<4" \
+        protobuf \
         torchserve==${TORCHSERVE_VERSION} \
         torch-model-archiver==${TORCHSERVE_VERSION} \
     && rm -rf ~/.cache/pip/*
@@ -176,15 +181,15 @@ RUN mkdir -p /etc/apt/keyrings \
     && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
 
 # Neuron SDK components version numbers
-ARG NEURONX_COLLECTIVES_LIB_VERSION=2.30.59.0-f5cdefb39
-ARG NEURONX_RUNTIME_LIB_VERSION=2.30.51.0-faafe26f0
-ARG NEURONX_TOOLS_VERSION=2.28.23.0-f1c114a9d
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.31.24.0-1a31ba186
+ARG NEURONX_RUNTIME_LIB_VERSION=2.31.24.0-0b044f4ce
+ARG NEURONX_TOOLS_VERSION=2.29.18.0-d5fe7ba42
 
-ARG NEURONX_CC_VERSION=2.23.6484.0+3b612583
-ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.12.22436+0f1dac25
-ARG NEURONX_DISTRIBUTED_VERSION=0.17.26814+4b18de63
-ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.8.16251+f3ca5575
-ARG NKI_VERSION=0.2.0+g82fdb402
+ARG NEURONX_CC_VERSION=2.24.5133.0+58f8de22
+ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.13.24727+8e870898
+ARG NEURONX_DISTRIBUTED_VERSION=0.18.27753+1cafd54f
+ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.9.17334+ced6ae4e
+ARG NKI_VERSION=0.3.0+23928721754.g18aa1271
 
 FROM base AS repo
 
@@ -203,8 +208,8 @@ RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP
         --index-url ${PIP_REPO_URL} \
         --trusted-host ${NEURON_PIP_REPO} \
         --extra-index-url ${PYPI_SIMPLE_URL} \
-        "neuronx-cc>=2.0" \
-        "torch-neuronx==2.9.*" \
+        neuronx-cc \
+        torch-neuronx \
         neuronx_distributed \
         neuronx_distributed_inference \
         nki \
@@ -236,6 +241,12 @@ RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP
 
 FROM ${BUILD_STAGE} AS final
 
+# Upgrade OS packages to latest versions
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
 EXPOSE 8080 8081
 
 ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
diff --git a/pytorch/training/2.9.0/Dockerfile.neuronx b/pytorch/training/2.9.0/Dockerfile.neuronx
index cc0439b..6a64abe 100644
--- a/pytorch/training/2.9.0/Dockerfile.neuronx
+++ b/pytorch/training/2.9.0/Dockerfile.neuronx
@@ -117,15 +117,19 @@ ENV PATH="$PATH:/home/.openmpi/bin"
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
 RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
 
+# Install AWS CLI v2 (-f/--retry: abort on HTTP errors instead of unzipping an error page)
+RUN curl --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+    && unzip awscliv2.zip \
+    && ./aws/install \
+    && rm -rf awscliv2.zip aws
+
 RUN ${PIP} install --no-cache-dir -U \
-        "bokeh>=2.3,<3" \
-        "awscli<2" \
         scipy \
         click \
-        "cryptography" \
+        cryptography \
         "sagemaker>=2,<3" \
-        "sagemaker-pytorch-training" \
-        psutil==5.6.7 \
+        sagemaker-pytorch-training \
+        psutil \
         dataset \
         Pillow \
         uv \
@@ -136,26 +140,17 @@ RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pk
 # Copy the NxDT Installation files
 COPY --chmod=755 apex_setup.py nxdt_install_setup.sh nxdt_requirements.txt /root/
 
-# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
-# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
-# awscli 1.25.47 has requirement docutils<0.17,>=0.10
-# etcd for kubernetes installation
-# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
-# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
 RUN ${PIP} install --no-cache-dir -U \
-        "attrs<24,>=23.1.0" \
-        "docutils>=0.10,<0.17" \
-        "rsa<4.8,>=3.1.2" \
-        "python-etcd" \
-        "urllib3>=1.26.0,<1.27" \
+        attrs \
+        python-etcd \
     # Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
     && ${PIP} install --no-cache-dir -U \
-        "bokeh>=3.0.1,<4" \
-        "imageio>=2.22,<3" \
-        "opencv-python>=4.8.1.78" \
-        "plotly>=5.11,<6" \
-        "seaborn>=0.12,<1" \
-        "shap>=0.41,<1" \
+        bokeh \
+        imageio \
+        opencv-python \
+        plotly \
+        seaborn \
+        shap \
     && rm -rf ~/.cache/pip/*
 
 # EFA Installer does apt get. Make sure to run apt update before that
@@ -177,7 +172,7 @@ RUN apt-get update \
 # Needed for running bert training scripts
 RUN pip3 install --no-cache-dir -U \
         graphviz \
-        tensorboard==2.6 \
+        tensorboard \
         accelerate \
     # Install NxDT dependencies
     && ${PIP} install --no-cache-dir \
@@ -212,15 +207,15 @@ RUN mkdir -p /etc/apt/keyrings \
     && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
 
 # Neuron SDK components
-ARG NEURONX_COLLECTIVES_LIB_VERSION=2.30.59.0-f5cdefb39
-ARG NEURONX_RUNTIME_LIB_VERSION=2.30.51.0-faafe26f0
-ARG NEURONX_TOOLS_VERSION=2.28.23.0-f1c114a9d
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.31.24.0-1a31ba186
+ARG NEURONX_RUNTIME_LIB_VERSION=2.31.24.0-0b044f4ce
+ARG NEURONX_TOOLS_VERSION=2.29.18.0-d5fe7ba42
 
-ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.12.22436+0f1dac25
-ARG NEURONX_CC_VERSION=2.23.6484.0+3b612583
-ARG NEURONX_DISTRIBUTED_VERSION=0.17.26814+4b18de63
+ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.13.24727+8e870898
+ARG NEURONX_CC_VERSION=2.24.5133.0+58f8de22
+ARG NEURONX_DISTRIBUTED_VERSION=0.18.27753+1cafd54f
 ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.7.0
-ARG NKI_VERSION=0.2.0+g82fdb402
+ARG NKI_VERSION=0.3.0+23928721754.g18aa1271
 
 FROM base AS repo
 
@@ -239,13 +234,20 @@ RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP
         --index-url ${PIP_REPO_URL} \
         --trusted-host ${NEURON_PIP_REPO} \
         --extra-index-url ${PYPI_SIMPLE_URL} \
-        "torch-neuronx==2.9.*" \
-        "neuronx-cc>=2.0" \
+        torch-neuronx \
+        neuronx-cc \
         neuronx_distributed \
-        neuronx_distributed_training \
         nki \
     && rm -rf ~/.cache/pip/*
 
+# Always install neuronx_distributed_training from the prod repo
+RUN ${PIP} install --no-cache-dir \
+        --index-url https://pip.repos.neuron.amazonaws.com \
+        --trusted-host pip.repos.neuron.amazonaws.com \
+        --extra-index-url ${PYPI_SIMPLE_URL} \
+        neuronx_distributed_training \
+    && rm -rf ~/.cache/pip/*
+
 FROM base AS prod
 
 # Install Neuron components with specific versions
@@ -266,15 +268,29 @@ RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP
         torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
         neuronx-cc==$NEURONX_CC_VERSION \
         neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
-        neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION \
         nki==$NKI_VERSION \
     && rm -rf ~/.cache/pip/*
 
+# Always install neuronx_distributed_training from the prod repo
+RUN ${PIP} install --no-cache-dir \
+        --index-url https://pip.repos.neuron.amazonaws.com \
+        --trusted-host pip.repos.neuron.amazonaws.com \
+        --extra-index-url ${PYPI_SIMPLE_URL} \
+        neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION \
+    && rm -rf ~/.cache/pip/*
+
 FROM ${BUILD_STAGE} AS final
 
+# Upgrade OS packages to latest versions
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
 ## Installation for Neuronx Distributed Training framework
 # Clone and build Apex
-RUN git clone https://github.com/NVIDIA/apex.git /root/apex \
+RUN --mount=type=cache,target=/root/.cache/bazel \
+    git clone https://github.com/NVIDIA/apex.git /root/apex \
     && cd /root/apex \
     && git checkout 25.07 \
     && cp /root/apex_setup.py setup.py \
@@ -282,7 +298,7 @@ RUN git clone https://github.com/NVIDIA/apex.git /root/apex \
     && ${PIP} install --no-cache-dir --no-build-isolation -r /root/nxdt_requirements.txt /root/apex \
     && /root/nxdt_install_setup.sh \
     && ${PIP} install --force-reinstall \
-        "torch==2.9.0" \
+        "torch==2.9.*" \
         torchvision \
     && rm -rf ~/.cache/pip/*
 
diff --git a/vllm/inference/0.16.0/Dockerfile.neuronx b/vllm/inference/0.16.0/Dockerfile.neuronx
new file mode 100644
index 0000000..3edd8f7
--- /dev/null
+++ b/vllm/inference/0.16.0/Dockerfile.neuronx
@@ -0,0 +1,248 @@
+ARG BUILD_STAGE=prod
+
+FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base
+
+LABEL dlc_major_version="1"
+LABEL maintainer="Amazon AI"
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG PIP=pip3
+ARG PYTHON=python3.12
+ARG PYTHON_VERSION=3.12.11
+ARG TORCHSERVE_VERSION=0.11.0
+ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"
+
+
+# See http://bugs.python.org/issue19846
+ENV LANG=C.UTF-8
+ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH
+ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:$PATH
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+        apt-transport-https \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        emacs \
+        ffmpeg \
+        gcc \
+        git \
+        gnupg2 \
+        gpg-agent \
+        jq \
+        libgl1 \
+        libgl1-mesa-dri \
+        libglib2.0-0 \
+        libsm6 \
+        libxext6 \
+        libxrender-dev \
+        libcap-dev \
+        libhwloc-dev \
+        openssh-client \
+        openjdk-11-jdk \
+        unzip \
+        vim \
+        wget \
+        zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+
+# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files
+RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \
+    mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \
+    /var/lib/dpkg/info/ca-certificates-java.postinst configure;
+
+RUN curl -L -o ~/miniforge.sh https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh \
+    && chmod +x ~/miniforge.sh \
+    && ~/miniforge.sh -b -p /opt/conda \
+    && rm ~/miniforge.sh \
+    && /opt/conda/bin/conda update -y conda \
+    && /opt/conda/bin/mamba install -c conda-forge -y \
+        python=$PYTHON_VERSION \
+        pyopenssl \
+        cython \
+        mkl-include \
+        mkl \
+        parso \
+        typing \
+        # Below 2 are included in miniconda base, but not mamba so need to install
+        conda-content-trust \
+        charset-normalizer \
+    && /opt/conda/bin/conda clean -ya
+
+RUN /opt/conda/bin/mamba install -c conda-forge -y \
+        python=$PYTHON_VERSION \
+        scikit-learn \
+        h5py \
+        requests \
+    && conda clean -ya \
+    && pip install --upgrade pip \
+        --trusted-host pypi.org --trusted-host files.pythonhosted.org \
+    && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
+    && pip install \
+        enum-compat \
+        ipython \
+    && rm -rf ~/.cache/pip/*
+
+# Install EFA
+RUN apt-get update \
+    && cd $HOME \
+    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+    && cat aws-efa-installer.key | gpg --fingerprint \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+    && tar -xf aws-efa-installer-latest.tar.gz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
+    && cd $HOME \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+COPY --chmod=755 vllm_entrypoint.py neuron-monitor.sh deep_learning_container.py /usr/local/bin/
+
+### Mount Point ###
+# When launching the container, mount the code directory to /workspace (JSON-form VOLUME needs a quoted path)
+ARG APP_MOUNT=/workspace
+VOLUME ["${APP_MOUNT}"]
+WORKDIR ${APP_MOUNT}/vllm
+
+# Install AWS CLI v2 (-f/--retry: abort on HTTP errors instead of unzipping an error page)
+RUN curl --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+    && unzip awscliv2.zip \
+    && ./aws/install \
+    && rm -rf awscliv2.zip aws
+
+RUN ${PIP} install --no-cache-dir -U \
+        "opencv-python" \
+        "pandas" \
+        "boto3" \
+        "cryptography" \
+        "pytest" \
+        "wheel" \
+        "jinja2" \
+        uv \
+        torchserve==${TORCHSERVE_VERSION} \
+        torch-model-archiver==${TORCHSERVE_VERSION} \
+    && rm -rf ~/.cache/pip/*
+
+RUN useradd -m model-server \
+    && mkdir -p /home/model-server/tmp /opt/ml/model \
+    && chown -R model-server /home/model-server /opt/ml/model
+COPY config.properties /home/model-server
+
+# Compliance
+RUN HOME_DIR=/root \
+    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+    && chmod +x /usr/local/bin/testOSSCompliance \
+    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+    && rm -rf ${HOME_DIR}/oss_compliance* \
+    # conda leaves an empty /root/.cache/conda/notices.cache file which is not removed by conda clean -ya
+    && rm -rf ${HOME_DIR}/.cache/conda
+
+# Setting up APT and PIP repo for neuron artifacts
+ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
+ARG NEURON_APT_REPO_KEY
+ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
+ARG NEURON_PIP_REPO_KEY
+RUN mkdir -p /etc/apt/keyrings \
+    && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
+    && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
+    && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
+
+# Neuron SDK components version numbers
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.31.24.0-1a31ba186
+ARG NEURONX_RUNTIME_LIB_VERSION=2.31.24.0-0b044f4ce
+ARG NEURONX_TOOLS_VERSION=2.29.18.0-d5fe7ba42
+
+ARG NEURONX_CC_VERSION=2.24.5133.0+58f8de22
+ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.13.24727+8e870898
+ARG NEURONX_DISTRIBUTED_VERSION=0.18.27753+1cafd54f
+ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.9.17334+ced6ae4e
+ARG NKI_VERSION=0.3.0+23928721754.g18aa1271
+
+# GitHub repository and branch
+ARG GITHUB_REPO=https://github.com/vllm-project/vllm-neuron.git
+ARG GITHUB_REPO_BRANCH=release-0.5.0
+
+# Configure SSH access: record github.com's host key; accept-new trusts unseen hosts but rejects changed keys
+RUN mkdir -p /root/.ssh \
+    && echo "StrictHostKeyChecking accept-new" >> /root/.ssh/config \
+    && ssh-keyscan -t rsa github.com >> /root/.ssh/known_hosts
+
+# Clone vllm-neuronx repository
+RUN --mount=type=secret,id=containers_github_ssh_key,target=/root/.ssh/id_ed25519,mode=0600 \
+    git clone -b ${GITHUB_REPO_BRANCH} ${GITHUB_REPO} /opt/vllm
+
+FROM base AS repo
+
+
+# Install Neuron components from the apt and pip repos (latest versions)
+RUN apt-get update \
+    && apt-get install -y \
+        aws-neuronx-tools \
+        aws-neuronx-collectives \
+        aws-neuronx-runtime-lib \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir \
+        --index-url ${PIP_REPO_URL} \
+        --trusted-host ${NEURON_PIP_REPO} \
+        --extra-index-url ${PYPI_SIMPLE_URL} \
+        neuronx-cc \
+        torch-neuronx \
+        neuronx_distributed \
+        neuronx_distributed_inference \
+        nki \
+        -e /opt/vllm \
+    && rm -rf ~/.cache/pip/*
+
+FROM base AS prod
+
+# Install Neuron components with specific versions
+RUN apt-get update \
+    && apt-get install -y \
+        aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+        aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+        aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir \
+        --index-url ${PIP_REPO_URL} \
+        --trusted-host ${NEURON_PIP_REPO} \
+        --extra-index-url ${PYPI_SIMPLE_URL} \
+        neuronx-cc==$NEURONX_CC_VERSION \
+        torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+        neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
+        neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \
+        nki==$NKI_VERSION \
+        -e /opt/vllm \
+    && rm -rf ~/.cache/pip/*
+
+FROM ${BUILD_STAGE} AS final
+
+# Upgrade OS packages to latest versions
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+EXPOSE 8080 8081
+
+ENTRYPOINT ["python", "/usr/local/bin/vllm_entrypoint.py"]
+CMD ["/bin/bash"]
+HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1