diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000..4fbf6af
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @sfc-gh-zhwang @sfc-gh-hykim
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b1eba0..c65b596 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,9 +113,8 @@ if (EXISTS ${FT_DIR})
 else()
   FetchContent_Declare(
     repo-ft
-    GIT_REPOSITORY https://github.com/NVIDIA/FasterTransformer.git
-    GIT_TAG main
-    GIT_SHALLOW ON
+    GIT_REPOSITORY https://github.com/neevaco/FasterTransformer.git
+    GIT_TAG b6b21406449ab19f00d1d5f97338065037b5f8e3
   )
 endif()
 FetchContent_MakeAvailable(repo-common repo-core repo-backend repo-ft)
diff --git a/LEGAL.md b/LEGAL.md
new file mode 100644
index 0000000..e52a5f1
--- /dev/null
+++ b/LEGAL.md
@@ -0,0 +1 @@
+This application is not part of the Snowflake Service and is governed by the terms in LICENSE, unless expressly agreed to in writing. You use this application at your own risk, and Snowflake has no obligation to support your use of this application.
diff --git a/docker/Dockerfile b/docker/Dockerfile
index bfbfba0..2768efc 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -42,7 +42,9 @@ RUN apt-get update && \
 RUN pip3 install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu118 torch==2.0.1+cu118 && \
     pip3 install --no-cache-dir --extra-index-url https://pypi.ngc.nvidia.com regex fire tritonclient[all] && \
     pip3 install --no-cache-dir accelerate transformers huggingface_hub tokenizers SentencePiece sacrebleu datasets tqdm omegaconf rouge_score && \
-    pip3 install --no-cache-dir cmake==3.24.3
+    pip3 install --no-cache-dir cmake==3.24.3 && \
+    pip3 install --no-cache-dir langid==1.1.6 && \
+    pip3 install --no-cache-dir lingua-language-detector==2.0.2
 
 # backend build
 ADD . /workspace/build/fastertransformer_backend
@@ -66,6 +68,11 @@ RUN CUDAFLAGS="-include stdio.h" cmake \
     rm /workspace/build/fastertransformer_backend/build/bin/*_example -rf && \
     rm /workspace/build/fastertransformer_backend/build/lib/lib*Backend.so -rf
 
+# Removing git because of CVEs, no longer needed after build
+RUN apt-get purge git git-man -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 ENV NCCL_LAUNCH_MODE=GROUP
 ENV WORKSPACE /workspace
 WORKDIR /workspace
diff --git a/src/libfastertransformer.cc b/src/libfastertransformer.cc
index 6076d1f..06ec7a0 100644
--- a/src/libfastertransformer.cc
+++ b/src/libfastertransformer.cc
@@ -49,10 +49,14 @@
 // FT's libraries have dependency with triton's lib
 #include "src/fastertransformer/triton_backend/bert/BertTritonModel.h"
+#include "src/fastertransformer/triton_backend/bart/BartTritonModel.h"
+#include "src/fastertransformer/triton_backend/m2m/M2MTritonModel.h"
+#include "src/fastertransformer/triton_backend/deberta/DebertaTritonModel.h"
 #include "src/fastertransformer/triton_backend/gptj/GptJTritonModel.h"
 #include "src/fastertransformer/triton_backend/gptj/GptJTritonModelInstance.h"
 #include "src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModel.h"
 #include "src/fastertransformer/triton_backend/gptneox/GptNeoXTritonModelInstance.h"
+#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
 #include "src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h"
 #include "src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.h"
 #include "src/fastertransformer/triton_backend/t5/T5TritonModel.h"
 #include "src/fastertransformer/triton_backend/t5/T5TritonModelInstance.h"
@@ -327,6 +331,63 @@ std::shared_ptr<AbstractTransformerModel> ModelState::ModelFactory(
         } else if (data_type == "bf16") {
             ft_model = std::make_shared<BertTritonModel<__nv_bfloat16>>(
                 tp, pp, custom_ar, model_dir, int8_mode, is_sparse, remove_padding);
+#endif
+        }
+    } else if (model_type == "llama") {
+        const int int8_mode = param_get_int(param, "int8_mode");
+
+        if (data_type == "fp16") {
+            ft_model = std::make_shared<LlamaTritonModel<half>>(
+                tp, pp, custom_ar, model_dir, int8_mode);
+        } else if (data_type == "fp32") {
+            ft_model = std::make_shared<LlamaTritonModel<float>>(
+                tp, pp, custom_ar, model_dir, int8_mode);
+#ifdef ENABLE_BF16
+        } else if (data_type == "bf16") {
+            ft_model = std::make_shared<LlamaTritonModel<__nv_bfloat16>>(
+                tp, pp, custom_ar, model_dir, int8_mode);
+#endif
+        }
+    } else if (model_type == "bart") {
+        if (data_type == "fp16") {
+            ft_model = std::make_shared<BartTritonModel<half>>(
+                tp, pp, custom_ar, model_dir, 0);
+        } else if (data_type == "fp32") {
+            ft_model = std::make_shared<BartTritonModel<float>>(
+                tp, pp, custom_ar, model_dir, 0);
+#ifdef ENABLE_BF16
+        } else if (data_type == "bf16") {
+            ft_model = std::make_shared<BartTritonModel<__nv_bfloat16>>(
+                tp, pp, custom_ar, model_dir, 0);
+#endif
+        }
+    } else if (model_type == "m2m") {
+        if (data_type == "fp16") {
+            ft_model = std::make_shared<M2MTritonModel<half>>(
+                tp, pp, custom_ar, model_dir, 0);
+        } else if (data_type == "fp32") {
+            ft_model = std::make_shared<M2MTritonModel<float>>(
+                tp, pp, custom_ar, model_dir, 0);
+#ifdef ENABLE_BF16
+        } else if (data_type == "bf16") {
+            ft_model = std::make_shared<M2MTritonModel<__nv_bfloat16>>(
+                tp, pp, custom_ar, model_dir, 0);
+#endif
+        }
+    } else if (model_type == "deberta") {
+        const int is_sparse = param_get_bool(param, "is_sparse", false);
+        const int remove_padding = param_get_bool(param, "is_remove_padding", false);
+
+        if (data_type == "fp16") {
+            ft_model = std::make_shared<DebertaTritonModel<half>>(
+                tp, pp, custom_ar, model_dir, is_sparse, remove_padding);
+        } else if (data_type == "fp32") {
+            ft_model = std::make_shared<DebertaTritonModel<float>>(
+                tp, pp, custom_ar, model_dir, is_sparse, remove_padding);
+#ifdef ENABLE_BF16
+        } else if (data_type == "bf16") {
+            ft_model = std::make_shared<DebertaTritonModel<__nv_bfloat16>>(
+                tp, pp, custom_ar, model_dir, is_sparse, remove_padding);
 #endif
         }
     } else {
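Note on usage: the new ModelFactory branches are selected entirely by the `model_type` and `data_type` strings in the Triton model configuration. A minimal sketch of the relevant `parameters` entries in a `config.pbtxt` for the new `llama` branch (not part of this diff; the `model_type`, `data_type`, and `int8_mode` keys are the ones read above via `param_get_int`, and the values shown are illustrative only):

    parameters {
      key: "model_type"
      value: { string_value: "llama" }   # dispatches to LlamaTritonModel
    }
    parameters {
      key: "data_type"
      value: { string_value: "fp16" }    # "fp16" -> half, "fp32" -> float, "bf16" -> __nv_bfloat16
    }
    parameters {
      key: "int8_mode"
      value: { string_value: "0" }       # read by param_get_int in ModelFactory
    }

The `bart` and `m2m` branches need only `model_type` and `data_type` (their int8 argument is hard-coded to 0), while `deberta` additionally reads the `is_sparse` and `is_remove_padding` keys via `param_get_bool`, defaulting both to false.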