From cd6230124d7448dd0a069a2a6fd2084659c36df5 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Wed, 13 Sep 2023 09:17:30 -0700 Subject: [PATCH 01/23] commit --- docker/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index bfbfba0..c5d6a97 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -39,8 +39,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN pip3 install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu118 torch==2.0.1+cu118 && \ - pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com regex fire tritonclient[all] && \ +RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com regex fire tritonclient[all] && \ pip3 install --no-cache-dir accelerate transformers huggingface_hub tokenizers SentencePiece sacrebleu datasets tqdm omegaconf rouge_score && \ pip3 install --no-cache-dir cmake==3.24.3 From 25953e3afe1d318d871ddf35acc934888ba4286a Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 14 Sep 2023 21:19:22 -0700 Subject: [PATCH 02/23] commit --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f86ee8a..51b0d87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,7 +114,7 @@ else() FetchContent_Declare( repo-ft GIT_REPOSITORY https://github.com/neevaco/FasterTransformer.git - GIT_TAG 23b37c7b158d05f6206119b1ef831c8e63cc1eb9 + GIT_TAG 3968bf43f687767898597c9f284033c508b42e1b GIT_SHALLOW ON ) endif() From 3c79c5f21ca66f77fe299541c9bd1d56dc0cc38e Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 14 Sep 2023 21:33:41 -0700 Subject: [PATCH 03/23] commit --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51b0d87..9b1db2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,7 +114,7 @@ else() FetchContent_Declare( repo-ft GIT_REPOSITORY https://github.com/neevaco/FasterTransformer.git - GIT_TAG 3968bf43f687767898597c9f284033c508b42e1b + GIT_TAG 6c608049c92a51902f8e6b38ff3aca19ffc6bed4 GIT_SHALLOW ON ) endif() From 1b67215fa84c3752e811a4be2e263eeb376f9f6b Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Fri, 15 Sep 2023 00:39:13 -0700 Subject: [PATCH 04/23] commit --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b1db2d..d2ec963 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,7 +114,7 @@ else() FetchContent_Declare( repo-ft GIT_REPOSITORY https://github.com/neevaco/FasterTransformer.git - GIT_TAG 6c608049c92a51902f8e6b38ff3aca19ffc6bed4 + GIT_TAG 1342ef60836bd480995501df2be2d3a5c5750ba2 GIT_SHALLOW ON ) endif() From 7ea8b63c7e3f022a0df53e0e54b6ae3fb6a437fb Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Wed, 20 Sep 2023 15:19:56 -0700 Subject: [PATCH 05/23] commit --- docker/Dockerfile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index c5d6a97..7d500da 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -43,6 +43,17 @@ RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com re pip3 install --no-cache-dir accelerate transformers huggingface_hub tokenizers SentencePiece sacrebleu datasets tqdm omegaconf rouge_score && \ pip3 install --no-cache-dir cmake==3.24.3 +RUN mkdir /tmp/openmpi && \ + cd /tmp/openmpi && \ + wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ + tar zxf openmpi-4.1.5.tar.gz && \ + cd openmpi-4.1.5 && \ + ./configure --enable-orterun-prefix-by-default && \ + make -j $(nproc) all && \ + make install && \ + ldconfig && \ + rm -rf /tmp/openmpi + # backend build ADD . /workspace/build/fastertransformer_backend RUN mkdir -p /workspace/build/fastertransformer_backend/build From fd336c8f90fbd9da20a44ff9a29dc420f1ae4e7c Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Wed, 20 Sep 2023 20:16:45 -0700 Subject: [PATCH 06/23] commit --- docker/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7d500da..c9ef240 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,6 +16,8 @@ ARG TRITON_VERSION=23.05 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 FROM ${BASE_IMAGE} +USER root + RUN apt-get update && \ apt-get install -y --no-install-recommends \ autoconf \ From 6949522bd6b96b35606fc6c16425a5268b53e012 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Wed, 20 Sep 2023 20:32:49 -0700 Subject: [PATCH 07/23] commit --- docker/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index c9ef240..71e9355 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -45,6 +45,9 @@ RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com re pip3 install --no-cache-dir accelerate transformers huggingface_hub tokenizers SentencePiece sacrebleu datasets tqdm omegaconf rouge_score && \ pip3 install --no-cache-dir cmake==3.24.3 +RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ + make -j src.build CUDA_HOME=/usr/local/cuda + RUN mkdir /tmp/openmpi && \ cd /tmp/openmpi && \ wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ From 739489bc1880cdb492381bc37bb6c838a5fe8de0 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Wed, 20 Sep 2023 20:33:15 -0700 Subject: [PATCH 08/23] commit --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 71e9355..e4a0ca8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -47,7 +47,7 @@ RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com re RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ make -j src.build CUDA_HOME=/usr/local/cuda - + RUN mkdir /tmp/openmpi && \ cd /tmp/openmpi && \ wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ From 8141b4f3fb9e40943b9d225b8312782781179272 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 21 Sep 2023 02:24:25 -0700 Subject: [PATCH 09/23] commit --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e4a0ca8..17e25f6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -46,7 +46,7 @@ RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com re pip3 install --no-cache-dir cmake==3.24.3 RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ - make -j src.build CUDA_HOME=/usr/local/cuda + make -j src.build CUDA_HOME=/usr/local/cuda BUILDDIR=/usr RUN mkdir /tmp/openmpi && \ cd /tmp/openmpi && \ From ec09c05019d6aecc9b80552e3785af8c2903b651 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 21 Sep 2023 14:16:57 -0700 Subject: [PATCH 10/23] commit --- docker/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 17e25f6..798daee 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -69,6 +69,7 @@ RUN CUDAFLAGS="-include stdio.h" cmake \ -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ -D CMAKE_BUILD_TYPE=Release \ -D ENABLE_FP8=OFF \ + -D CMAKE_PREFIX_PATH=/usr/local/mpi \ -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ -D TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ From 137ec3d93a69c85a8b7b697958964b90d3e58da4 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 21 Sep 2023 17:04:18 -0700 Subject: [PATCH 11/23] commit --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d2ec963..5bc934e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) # Python.h needed by torch headers. find_package(Python3 REQUIRED COMPONENTS Development) +set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} /usr/local/mpi/lib) find_package(CUDA 10.1 REQUIRED) if (BUILD_MULTI_GPU) From 46feca9355fcdcd06431b3d7d7ca6d6264c8f808 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 21 Sep 2023 17:05:07 -0700 Subject: [PATCH 12/23] commit --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bc934e..d2ec963 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,7 +57,6 @@ set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) # Python.h needed by torch headers. find_package(Python3 REQUIRED COMPONENTS Development) -set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} /usr/local/mpi/lib) find_package(CUDA 10.1 REQUIRED) if (BUILD_MULTI_GPU) From 6a653e570fc30a5d282d60bf5e015de47ad13936 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 21 Sep 2023 19:35:30 -0700 Subject: [PATCH 13/23] commit --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 798daee..91be0c7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -69,7 +69,7 @@ RUN CUDAFLAGS="-include stdio.h" cmake \ -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ -D CMAKE_BUILD_TYPE=Release \ -D ENABLE_FP8=OFF \ - -D CMAKE_PREFIX_PATH=/usr/local/mpi \ + -D CMAKE_PREFIX_PATH=/usr/local/mpi/ \ -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ -D TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ From 5744abf5a58fe7aa1858f2ede879f46cc3e37cdc Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 21 Sep 2023 23:16:18 -0700 Subject: [PATCH 14/23] commit --- docker/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 91be0c7..8837538 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -48,6 +48,8 @@ RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com re RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ make -j src.build CUDA_HOME=/usr/local/cuda BUILDDIR=/usr +RUN rm -rf /opt/hpcx + RUN mkdir /tmp/openmpi && \ cd /tmp/openmpi && \ wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ From 30553e174cc46589ce955e9df8bb6431a4e91953 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 21 Sep 2023 23:32:21 -0700 Subject: [PATCH 15/23] commit --- docker/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 8837538..f2eae6f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -48,7 +48,10 @@ RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com re RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ make -j src.build CUDA_HOME=/usr/local/cuda BUILDDIR=/usr -RUN rm -rf /opt/hpcx +RUN rm -rf /opt/hpcx && \ + wget https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ + tar -xvf hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ + cp -r hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz/ /opt/hpcx/ RUN mkdir /tmp/openmpi && \ cd /tmp/openmpi && \ From 9ac716ad4cb91fd5e11c3d0d2fcd14a28d76bb76 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Thu, 21 Sep 2023 23:32:35 -0700 Subject: [PATCH 16/23] commit --- docker/Dockerfile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f2eae6f..ab21249 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -53,16 +53,16 @@ RUN rm -rf /opt/hpcx && \ tar -xvf hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ cp -r hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz/ /opt/hpcx/ -RUN mkdir /tmp/openmpi && \ - cd /tmp/openmpi && \ - wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ - tar zxf openmpi-4.1.5.tar.gz && \ - cd openmpi-4.1.5 && \ - ./configure --enable-orterun-prefix-by-default && \ - make -j $(nproc) all && \ - make install && \ - ldconfig && \ - rm -rf /tmp/openmpi +# RUN mkdir /tmp/openmpi && \ +# cd /tmp/openmpi && \ +# wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ +# tar zxf openmpi-4.1.5.tar.gz && \ +# cd openmpi-4.1.5 && \ +# ./configure --enable-orterun-prefix-by-default && \ +# make -j $(nproc) all && \ +# make install && \ +# ldconfig && \ +# rm -rf /tmp/openmpi # backend build ADD . /workspace/build/fastertransformer_backend @@ -74,7 +74,7 @@ RUN CUDAFLAGS="-include stdio.h" cmake \ -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ -D CMAKE_BUILD_TYPE=Release \ -D ENABLE_FP8=OFF \ - -D CMAKE_PREFIX_PATH=/usr/local/mpi/ \ + # -D CMAKE_PREFIX_PATH=/usr/local/mpi/ \ -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ -D TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ From f740c58b2d424a936822fdf99885b4358f4989ab Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Fri, 22 Sep 2023 09:22:25 -0700 Subject: [PATCH 17/23] commit --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index ab21249..744818b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -51,7 +51,7 @@ RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ RUN rm -rf /opt/hpcx && \ wget https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ tar -xvf hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ - cp -r hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz/ /opt/hpcx/ + cp -r hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ /opt/hpcx/ # RUN mkdir /tmp/openmpi && \ # cd /tmp/openmpi && \ From 923354ed40b10c185dc234063559931840f8fe85 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Fri, 22 Sep 2023 15:41:42 -0700 Subject: [PATCH 18/23] commit --- docker/Dockerfile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 744818b..48b6fbb 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG TRITON_VERSION=23.05 +ARG TRITON_VERSION=23.08 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 FROM ${BASE_IMAGE} @@ -45,13 +45,13 @@ RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com re pip3 install --no-cache-dir accelerate transformers huggingface_hub tokenizers SentencePiece sacrebleu datasets tqdm omegaconf rouge_score && \ pip3 install --no-cache-dir cmake==3.24.3 -RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ - make -j src.build CUDA_HOME=/usr/local/cuda BUILDDIR=/usr +# RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ +# make -j src.build CUDA_HOME=/usr/local/cuda BUILDDIR=/usr -RUN rm -rf /opt/hpcx && \ - wget https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ - tar -xvf hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ - cp -r hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ /opt/hpcx/ +# RUN rm -rf /opt/hpcx && \ +# wget https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ +# tar -xvf hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ +# cp -r hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ /opt/hpcx/ # RUN mkdir /tmp/openmpi && \ # cd /tmp/openmpi && \ From afaeff9525bdd05fea9a600ba681baf03960c912 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 23 Sep 2023 19:31:39 -0700 Subject: [PATCH 19/23] commit --- docker/Dockerfile | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 48b6fbb..04cc98e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -45,24 +45,26 @@ RUN pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com re pip3 install --no-cache-dir accelerate transformers huggingface_hub tokenizers SentencePiece sacrebleu datasets tqdm omegaconf rouge_score && \ pip3 install --no-cache-dir cmake==3.24.3 -# RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ -# make -j src.build CUDA_HOME=/usr/local/cuda BUILDDIR=/usr +RUN rm -rf /opt/hpcx + +RUN git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ + make -j src.build CUDA_HOME=/usr/local/cuda BUILDDIR=/usr # RUN rm -rf /opt/hpcx && \ # wget https://content.mellanox.com/hpc/hpc-x/v2.16/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ # tar -xvf hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz && \ # cp -r hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ /opt/hpcx/ -# RUN mkdir /tmp/openmpi && \ -# cd /tmp/openmpi && \ -# wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ -# tar zxf openmpi-4.1.5.tar.gz && \ -# cd openmpi-4.1.5 && \ -# ./configure --enable-orterun-prefix-by-default && \ -# make -j $(nproc) all && \ -# make install && \ -# ldconfig && \ -# rm -rf /tmp/openmpi +RUN mkdir /tmp/openmpi && \ + cd /tmp/openmpi && \ + wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ + tar zxf openmpi-4.1.5.tar.gz && \ + cd openmpi-4.1.5 && \ + ./configure --enable-orterun-prefix-by-default && \ + make -j $(nproc) all && \ + make install && \ + ldconfig && \ + rm -rf /tmp/openmpi # backend build ADD . /workspace/build/fastertransformer_backend @@ -74,7 +76,7 @@ RUN CUDAFLAGS="-include stdio.h" cmake \ -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ -D CMAKE_BUILD_TYPE=Release \ -D ENABLE_FP8=OFF \ - # -D CMAKE_PREFIX_PATH=/usr/local/mpi/ \ + -D CMAKE_PREFIX_PATH=/usr/local/mpi/ \ -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ -D TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ From 2f458703098b8490805fab8da3234f6d141b0b5b Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 23 Sep 2023 19:40:07 -0700 Subject: [PATCH 20/23] commit --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 04cc98e..a9685a2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG TRITON_VERSION=23.08 +ARG TRITON_VERSION=23.06 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 FROM ${BASE_IMAGE} From 5d7bd3f5671cb0c1b5b351ce40190f543ed390eb Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sat, 23 Sep 2023 20:49:27 -0700 Subject: [PATCH 21/23] commit --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a9685a2..61143bc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -76,7 +76,7 @@ RUN CUDAFLAGS="-include stdio.h" cmake \ -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ -D CMAKE_BUILD_TYPE=Release \ -D ENABLE_FP8=OFF \ - -D CMAKE_PREFIX_PATH=/usr/local/mpi/ \ + -D MPI_HOME=/usr/local/mpi/ \ -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ -D TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ From d753bda279bed49616616a730473093417555d23 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Sun, 24 Sep 2023 12:00:08 -0700 Subject: [PATCH 22/23] commit --- docker/Dockerfile | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 61143bc..077be4f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -71,28 +71,28 @@ ADD . /workspace/build/fastertransformer_backend RUN mkdir -p /workspace/build/fastertransformer_backend/build WORKDIR /workspace/build/fastertransformer_backend/build -ARG FORCE_BACKEND_REBUILD=0 -RUN CUDAFLAGS="-include stdio.h" cmake \ - -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ - -D CMAKE_BUILD_TYPE=Release \ - -D ENABLE_FP8=OFF \ - -D MPI_HOME=/usr/local/mpi/ \ - -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ - -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ - -D TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ - -D TRITON_BACKEND_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ - .. && \ - cd _deps/repo-ft-src/ && \ - git log | head -n 3 2>&1 | tee /workspace/build/fastertransformer_backend/FT_version.txt && \ - cd /workspace/build/fastertransformer_backend/build && \ - CUDAFLAGS="-include stdio.h" make -O -j"$(grep -c ^processor /proc/cpuinfo)" install && \ - rm /workspace/build/fastertransformer_backend/build/bin/*_example -rf && \ - rm /workspace/build/fastertransformer_backend/build/lib/lib*Backend.so -rf +# ARG FORCE_BACKEND_REBUILD=0 +# RUN CUDAFLAGS="-include stdio.h" cmake \ +# -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ +# -D CMAKE_BUILD_TYPE=Release \ +# -D ENABLE_FP8=OFF \ +# -D MPI_HOME=/usr/local/mpi/ \ +# -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ +# -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ +# -D TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ +# -D TRITON_BACKEND_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ +# .. && \ +# cd _deps/repo-ft-src/ && \ +# git log | head -n 3 2>&1 | tee /workspace/build/fastertransformer_backend/FT_version.txt && \ +# cd /workspace/build/fastertransformer_backend/build && \ +# CUDAFLAGS="-include stdio.h" make -O -j"$(grep -c ^processor /proc/cpuinfo)" install && \ +# rm /workspace/build/fastertransformer_backend/build/bin/*_example -rf && \ +# rm /workspace/build/fastertransformer_backend/build/lib/lib*Backend.so -rf -ENV NCCL_LAUNCH_MODE=GROUP -ENV WORKSPACE /workspace -WORKDIR /workspace +# ENV NCCL_LAUNCH_MODE=GROUP +# ENV WORKSPACE /workspace +# WORKDIR /workspace -RUN sed -i 's/#X11UseLocalhost yes/X11UseLocalhost no/g' /etc/ssh/sshd_config && \ - mkdir /var/run/sshd -p +# RUN sed -i 's/#X11UseLocalhost yes/X11UseLocalhost no/g' /etc/ssh/sshd_config && \ +# mkdir /var/run/sshd -p From 7cc94cbe8c9768b0f25f80cb1bb2b31f7235c811 Mon Sep 17 00:00:00 2001 From: sfc-gh-zhwang Date: Mon, 25 Sep 2023 20:28:37 -0700 Subject: [PATCH 23/23] commit --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 077be4f..0af0858 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -60,7 +60,7 @@ RUN mkdir /tmp/openmpi && \ wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz && \ tar zxf openmpi-4.1.5.tar.gz && \ cd openmpi-4.1.5 && \ - ./configure --enable-orterun-prefix-by-default && \ + ./configure --enable-orterun-prefix-by-default --prefix=/usr/local/opt/openmpi && \ make -j $(nproc) all && \ make install && \ ldconfig && \